librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         "skip metadata in generation"
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + \
  97           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  98           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  99           u'</cmd>'
 100
 101     @escape(True)
 102     def get_authors(self, element, which=None):
 103         dc = self.options['wldoc'].book_info
 104         if which is None:
 105             authors = dc.authors_textbook + \
 106                 dc.authors_scenario + \
 107                 dc.authors_expert
 108         else:
 109             authors = getattr(dc, "authors_%s" % which)
 110         return u', '.join(author.readable() for author in authors if author)
 111
 112     @escape(1)
 113     def get_title(self, element):
 114         return self.get_dc(element, 'title', True)
 115
 116     def handle_utwor(self, element):
 117         lines = [
 118             u'''
 119     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 120         <TeXML escape="0">
 121         \\documentclass[%s]{wl}
 122         \\usepackage{style}''' % self.options['customization_str'],
 123     self.options['has_cover'] and '\usepackage{makecover}',
 124     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 125     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 126     (self.options['morefloats'] == 'none' and
 127      u'''\\IfFileExists{morefloats.sty}{
 128             \\usepackage{morefloats}
 129         }{}'''),
 130     u'''\\def\\authors{%s}''' % self.get_authors(element),
 131     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 132     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 133     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 134
 135     u'''\\author{\\authors}''',
 136     u'''\\title{%s}''' % self.get_title(element),
 137     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 138     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 139     u'</TeXML>']
 140
 141         return u"".join(filter(None, lines)), u'</TeXML>'
 142
 143
 144     @escape(1)
 145     def handle_powiesc(self, element):
 146         return u"""
 147     <env name="document">
 148     <cmd name="maketitle"/>
 149     """, """<cmd name="editorialsection" /></env>"""
 150
 151     @escape(1)
 152     def handle_texcommand(self, element):
 153         cmd = functions.texcommand(element.tag)
 154         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 155
 156     handle_akap = \
 157     handle_akap = \
 158     handle_akap_cd = \
 159     handle_akap_cd = \
 160     handle_akap_dialog = \
 161     handle_akap_dialog = \
 162     handle_autor_utworu = \
 163     handle_dedykacja = \
 164     handle_didaskalia = \
 165     handle_didask_tekst = \
 166     handle_dlugi_cytat = \
 167     handle_dzielo_nadrzedne = \
 168     handle_lista_osoba = \
 169     handle_mat = \
 170     handle_miejsce_czas = \
 171     handle_motto = \
 172     handle_motto_podpis = \
 173     handle_naglowek_akt = \
 174     handle_naglowek_czesc = \
 175     handle_naglowek_listy = \
 176     handle_naglowek_osoba = \
 177     handle_naglowek_scena = \
 178     handle_nazwa_utworu = \
 179     handle_nota = \
 180     handle_osoba = \
 181     handle_pa = \
 182     handle_pe = \
 183     handle_podtytul = \
 184     handle_poezja_cyt = \
 185     handle_pr = \
 186     handle_pt = \
 187     handle_sekcja_asterysk = \
 188     handle_sekcja_swiatlo = \
 189     handle_separator_linia = \
 190     handle_slowo_obce = \
 191     handle_srodtytul = \
 192     handle_tytul_dziela = \
 193     handle_wyroznienie = \
 194     handle_dywiz = \
 195     handle_texcommand
 196
 197     def handle_naglowek_rozdzial(self, element):
 198         if not self.options['teacher']:
 199             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek', u'Dla ucznia')):
 200                 self.state['mute'] = False
 201             else:
 202                 self.state['mute'] = True
 203                 return None
 204         return self.handle_texcommand(element)
 205     handle_naglowek_rozdzial.unmuter = True
 206
 207     def handle_naglowek_podrozdzial(self, element):
 208         self.activity_counter = 0
 209         if not self.options['teacher']:
 210             if element.text.startswith(u'Dla ucznia'):
 211                 self.state['mute'] = False
 212                 return None
 213             elif element.text.startswith(u'Dla nauczyciela'):
 214                 self.state['mute'] = True
 215                 return None
 216             elif self.state['mute']:
 217                 return None
 218         return self.handle_texcommand(element)
 219     handle_naglowek_podrozdzial.unmuter = True
 220
 221     def handle_uwaga(self, _e):
 222         return None
 223     def handle_extra(self, _e):
 224         return None
 225
 226     def handle_nbsp(self, _e):
 227         return '<spec cat="tilde" />'
 228
 229     _handle_strofa = cmd("strofa")
 230
 231     def handle_strofa(self, element):
 232         self.options = {'strofa': True}
 233         return self._handle_strofa(element)
 234
 235     def handle_aktywnosc(self, element):
 236         self.activity_counter += 1
 237         self.options = {
 238             'activity': True,
 239             'activity_counter': self.activity_counter,
 240             'sub_gen': True,
 241         }
 242         submill = EduModule(self.options, self.state)
 243
 244         if element.xpath('opis'):
 245             opis = submill.generate(element.xpath('opis')[0])
 246         else:
 247             opis = ''
 248
 249         n = element.xpath('wskazowki')
 250         if n: wskazowki = submill.generate(n[0])
 251
 252         else: wskazowki = ''
 253         n = element.xpath('pomoce')
 254
 255         if n: pomoce = submill.generate(n[0])
 256         else: pomoce = ''
 257
 258         forma = ''.join(element.xpath('forma/text()'))
 259
 260         czas = ''.join(element.xpath('czas/text()'))
 261
 262         counter = self.activity_counter
 263
 264         if element.getnext().tag == 'aktywnosc' or self.activity_last.getnext() == element:
 265             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 266         else:
 267             counter_tex = ''
 268
 269         self.activity_last = element
 270
 271         return u"""
 272 <cmd name="noindent" />
 273 %(counter_tex)s
 274 <cmd name="activityinfo"><parm>
 275  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 276  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 277  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 278 </parm></cmd>
 279
 280
 281 %(opis)s
 282
 283 %(wskazowki)s
 284 """ % locals()
 285
 286     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 287     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 288
 289     @ifoption(sub_gen=True)
 290     def handle_pomoce(self, _):
 291         return "Pomoce: ", ""
 292
 293     def handle_czas(self, *_):
 294         return
 295
 296     def handle_forma(self, *_):
 297         return
 298
 299     def handle_lista(self, element, attrs={}):
 300         ltype = element.attrib.get('typ', 'punkt')
 301         if not element.findall("punkt"):
 302             if ltype == 'czytelnia':
 303                 return 'W przygotowaniu.'
 304             else:
 305                 return None
 306         if ltype == 'slowniczek':
 307             surl = element.attrib.get('src', None)
 308             if surl is None:
 309                 # print '** missing src on <slowniczek>, setting default'
 310                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 311             sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 312             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 313
 314         listcmd = {'num': 'enumerate',
 315                'punkt': 'itemize',
 316                'alfa': 'itemize',
 317                'slowniczek': 'itemize',
 318                'czytelnia': 'itemize'}[ltype]
 319
 320         return u'<env name="%s">' % listcmd, u'</env>'
 321
 322     def handle_punkt(self, element):
 323         return '<cmd name="item"/>', ''
 324
 325     def handle_cwiczenie(self, element):
 326         exercise_handlers = {
 327             'wybor': Wybor,
 328             'uporzadkuj': Uporzadkuj,
 329             'luki': Luki,
 330             'zastap': Zastap,
 331             'przyporzadkuj': Przyporzadkuj,
 332             'prawdafalsz': PrawdaFalsz
 333         }
 334
 335         typ = element.attrib['typ']
 336         self.exercise_counter += 1
 337         if not typ in exercise_handlers:
 338             return '(no handler)'
 339         self.options = {'exercise_counter': self.exercise_counter}
 340         handler = exercise_handlers[typ](self.options, self.state)
 341         return handler.generate(element)
 342
 343     # XXX this is copied from pyhtml.py, except for return and
 344     # should be refactored for no code duplication
 345     def handle_definiendum(self, element):
 346         nxt = element.getnext()
 347         definiens_s = ''
 348
 349         # let's pull definiens from another document
 350         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 351             sxml = self.options['slowniczek_xml']
 352             assert element.text != ''
 353             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 354             if defloc:
 355                 definiens = defloc[0].getnext()
 356                 if definiens.tag == 'definiens':
 357                     subgen = EduModule(self.options, self.state)
 358                     definiens_s = subgen.generate(definiens)
 359
 360         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 361
 362     def handle_definiens(self, element):
 363         return u"", u""
 364
 365     def handle_podpis(self, element):
 366         return u"""<env name="figure">""", u"</env>"
 367
 368     def handle_tabela(self, element):
 369         max_col = 0
 370         for w in element.xpath("wiersz"):
 371             ks = w.xpath("kol")
 372             if max_col < len(ks):
 373                 max_col = len(ks)
 374         self.options = {'columnts': max_col}
 375         # styling:
 376                 #        has_frames = int(element.attrib.get("ramki", "0"))
 377                 #        if has_frames: frames_c = "framed"
 378                 #        else: frames_c = ""
 379                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 380         return u'''
 381 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 382     ''' % ('l' * max_col), \
 383     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 384
 385     @escape(1)
 386     def handle_wiersz(self, element):
 387         return u"", u'<ctrl ch="\\"/>'
 388
 389     @escape(1)
 390     def handle_kol(self, element):
 391         if element.getnext() is not None:
 392             return u"", u'<spec cat="align" />'
 393         return u"", u""
 394
 395     def handle_link(self, element):
 396         if element.attrib.get('url'):
 397             url = element.attrib.get('url')
 398             if url == element.text:
 399                 return cmd('url')(self, element)
 400             else:
 401                 return cmd('href', parms=[element.attrib['url']])(self, element)
 402         else:
 403             return cmd('emph')(self, element)
 404
 405     def handle_obraz(self, element):
 406         frmt = self.options['format']
 407         name = element.attrib.get('nazwa', '').strip()
 408         image = frmt.get_image(name.strip())
 409         name = image.get_filename().rsplit('/', 1)[-1]
 410         img_path = "obraz/%s" % name.replace("_", "")
 411         frmt.attachments[img_path] = image
 412         return cmd("obraz", parms=[img_path])(self)
 413
 414     def handle_video(self, element):
 415         url = element.attrib.get('url')
 416         if not url:
 417             print '!! <video> missing url'
 418             return
 419         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 420         if not m:
 421             print '!! unknown <video> url scheme:', url
 422             return
 423         name = m.group(1)
 424         thumb = IOFile.from_string(urlopen
 425             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 426         img_path = "video/%s.jpg" % name.replace("_", "")
 427         self.options['format'].attachments[img_path] = thumb
 428         canon_url = "https://www.youtube.com/watch?v=%s" % name
 429         return cmd("video", parms=[img_path, canon_url])(self)
 430
 431
 432 class Exercise(EduModule):
 433     def __init__(self, *args, **kw):
 434         self.question_counter = 0
 435         super(Exercise, self).__init__(*args, **kw)
 436
 437     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 438
 439     def handle_cwiczenie(self, element):
 440         self.options = {
 441             'exercise': element.attrib['typ'],
 442             'sub_gen': True,
 443         }
 444         self.question_counter = 0
 445         self.piece_counter = 0
 446
 447         header = etree.Element("parm")
 448         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 449         header_cmd.append(header)
 450         header.text = u"Zadanie %d." % self.options['exercise_counter']
 451
 452         pre = etree.tostring(header_cmd, encoding=unicode)
 453         post = u""
 454         # Add a single <pytanie> tag if it's not there
 455         if not element.xpath(".//pytanie"):
 456             qpre, qpost = self.handle_pytanie(element)
 457             pre = pre + qpre
 458             post = qpost + post
 459         return pre, post
 460
 461     def handle_pytanie(self, element):
 462         """This will handle <cwiczenie> element, when there is no <pytanie>
 463         """
 464         self.question_counter += 1
 465         self.piece_counter = 0
 466         pre = post = u""
 467         if self.options['teacher'] and element.attrib.get('rozw'):
 468             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 469         return pre, post
 470
 471     def handle_punkt(self, element):
 472         pre, post = super(Exercise, self).handle_punkt(element)
 473         if self.options['teacher'] and element.attrib.get('rozw'):
 474             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 475         return pre, post
 476
 477     def solution_header(self):
 478         par = etree.Element("cmd", name="par")
 479         parm = etree.Element("parm")
 480         parm.text = u"Rozwiązanie:"
 481         par.append(parm)
 482         return etree.tostring(par)
 483
 484     def explicit_solution(self):
 485         if self.options['solution']:
 486             par = etree.Element("cmd", name="par")
 487             parm = etree.Element("parm")
 488             parm.text = self.options['solution']
 489             par.append(parm)
 490             return self.solution_header() + etree.tostring(par)
 491
 492
 493
 494 class Wybor(Exercise):
 495     def handle_cwiczenie(self, element):
 496         pre, post = super(Wybor, self).handle_cwiczenie(element)
 497         is_single_choice = True
 498         pytania = element.xpath(".//pytanie")
 499         if not pytania:
 500             pytania = [element]
 501         for p in pytania:
 502             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 503             if len(solutions) != 1:
 504                 is_single_choice = False
 505                 break
 506             choices = p.xpath(".//*[@nazwa]")
 507             uniq = set()
 508             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 509             if len(choices) != len(uniq):
 510                 is_single_choice = False
 511                 break
 512
 513         self.options = {'single': is_single_choice}
 514         return pre, post
 515
 516     def handle_punkt(self, element):
 517         if self.options['exercise'] and element.attrib.get('nazwa', None):
 518             cmd = 'radio' if self.options['single'] else 'checkbox'
 519             return u'<cmd name="%s"/>' % cmd, ''
 520         else:
 521             return super(Wybor, self).handle_punkt(element)
 522
 523
 524 class Uporzadkuj(Exercise):
 525     def handle_pytanie(self, element):
 526         order_items = element.xpath(".//punkt/@rozw")
 527         return super(Uporzadkuj, self).handle_pytanie(element)
 528
 529
 530 class Przyporzadkuj(Exercise):
 531     def handle_lista(self, lista):
 532         header = etree.Element("parm")
 533         header_cmd = etree.Element("cmd", name="par")
 534         header_cmd.append(header)
 535         if 'nazwa' in lista.attrib:
 536             header.text = u"Kategorie:"
 537         elif 'cel' in lista.attrib:
 538             header.text = u"Elementy do przyporządkowania:"
 539         else:
 540             header.text = u"Lista:"
 541         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 542         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 543         return pre, post
 544
 545
 546 class Luki(Exercise):
 547     def find_pieces(self, question):
 548         return question.xpath(".//luka")
 549
 550     def solution(self, piece):
 551         piece = deepcopy(piece)
 552         piece.tail = None
 553         sub = EduModule()
 554         return sub.generate(piece)
 555
 556     def handle_pytanie(self, element):
 557         qpre, qpost = super(Luki, self).handle_pytanie(element)
 558
 559         luki = self.find_pieces(element)
 560         random.shuffle(luki)
 561         self.words = u"<env name='itemize'>%s</env>" % (
 562             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 563         )
 564         return qpre, qpost
 565
 566     def handle_opis(self, element):
 567         return '', self.words
 568
 569     def handle_luka(self, element):
 570         luka = "_" * 10
 571         if self.options['teacher']:
 572             piece = deepcopy(element)
 573             piece.tail = None
 574             sub = EduModule()
 575             text = sub.generate(piece)
 576             luka += u" [rozwiązanie: %s]" % text
 577         return luka
 578
 579
 580 class Zastap(Luki):
 581     def find_pieces(self, question):
 582         return question.xpath(".//zastap")
 583
 584     def solution(self, piece):
 585         return piece.attrib.get('rozw', '')
 586
 587     def list_header(self):
 588         return u"Elementy do wstawienia"
 589
 590     def handle_zastap(self, element):
 591         piece = deepcopy(element)
 592         piece.tail = None
 593         sub = EduModule()
 594         text = sub.generate(piece)
 595         if self.options['teacher'] and element.attrib.get('rozw'):
 596             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 597         return text
 598
 599
 600 class PrawdaFalsz(Exercise):
 601     def handle_punkt(self, element):
 602         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 603         if 'rozw' in element.attrib:
 604             post += u" [Prawda/Fałsz]"
 605         return pre, post
 606
 607
 608
 609 def fix_lists(tree):
 610     lists = tree.xpath(".//lista")
 611     for l in lists:
 612         if l.text:
 613             p = l.getprevious()
 614             if p is not None:
 615                 if p.tail is None: p.tail = ''
 616                 p.tail += l.text
 617             else:
 618                 p = l.getparent()
 619                 if p.text is None: p.text = ''
 620                 p.text += l.text
 621             l.text = ''
 622     return tree
 623
 624
 625 class EduModulePDFFormat(PDFFormat):
 626     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 627
 628     def get_texml(self):
 629         substitute_hyphens(self.wldoc.edoc)
 630         fix_hanging(self.wldoc.edoc)
 631
 632         self.attachments = {}
 633         edumod = EduModule({
 634             "wldoc": self.wldoc,
 635             "format": self,
 636             "teacher": self.customization.get('teacher'),
 637         })
 638         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 639
 640         open("/tmp/texml.xml", "w").write(texml)
 641         return texml
 642
 643     def get_tex_dir(self):
 644         temp = super(EduModulePDFFormat, self).get_tex_dir()
 645         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 646         for name, iofile in self.attachments.items():
 647             iofile.save_as(os.path.join(temp, name))
 648         return temp
 649
 650     def get_image(self, name):
 651         return self.wldoc.source.attachments[name]
 652