librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs the result through XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         "skip metadata in generation"
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + \
  97           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  98           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  99           u'</cmd>'
 100
 101     @escape(True)
 102     def get_authors(self, element, which=None):
 103         dc = self.options['wldoc'].book_info
 104         if which is None:
 105             authors = dc.authors_textbook + \
 106                 dc.authors_scenario + \
 107                 dc.authors_expert
 108         else:
 109             authors = getattr(dc, "authors_%s" % which)
 110         return u', '.join(author.readable() for author in authors if author)
 111
 112     @escape(1)
 113     def get_title(self, element):
 114         return self.get_dc(element, 'title', True)
 115
 116     def handle_utwor(self, element):
 117         lines = [
 118             u'''
 119     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 120         <TeXML escape="0">
 121         \\documentclass[%s]{wl}
 122         \\usepackage{style}''' % self.options['customization_str'],
 123     self.options['has_cover'] and '\usepackage{makecover}',
 124     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 125     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 126     (self.options['morefloats'] == 'none' and
 127      u'''\\IfFileExists{morefloats.sty}{
 128             \\usepackage{morefloats}
 129         }{}'''),
 130     u'''\\def\\authors{%s}''' % self.get_authors(element),
 131     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 132     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 133     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 134
 135     u'''\\author{\\authors}''',
 136     u'''\\title{%s}''' % self.get_title(element),
 137     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 138     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 139     u'</TeXML>']
 140
 141         return u"".join(filter(None, lines)), u'</TeXML>'
 142
 143
 144     @escape(1)
 145     def handle_powiesc(self, element):
 146         return u"""
 147     <env name="document">
 148     <cmd name="maketitle"/>
 149     """, """<cmd name="editorialsection" /></env>"""
 150
 151     @escape(1)
 152     def handle_texcommand(self, element):
 153         cmd = functions.texcommand(element.tag)
 154         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 155
 156     handle_akap = \
 157     handle_akap = \
 158     handle_akap_cd = \
 159     handle_akap_cd = \
 160     handle_akap_dialog = \
 161     handle_akap_dialog = \
 162     handle_autor_utworu = \
 163     handle_dedykacja = \
 164     handle_didaskalia = \
 165     handle_didask_tekst = \
 166     handle_dlugi_cytat = \
 167     handle_dzielo_nadrzedne = \
 168     handle_lista_osoba = \
 169     handle_mat = \
 170     handle_miejsce_czas = \
 171     handle_motto = \
 172     handle_motto_podpis = \
 173     handle_naglowek_akt = \
 174     handle_naglowek_czesc = \
 175     handle_naglowek_listy = \
 176     handle_naglowek_osoba = \
 177     handle_naglowek_scena = \
 178     handle_nazwa_utworu = \
 179     handle_nota = \
 180     handle_osoba = \
 181     handle_pa = \
 182     handle_pe = \
 183     handle_podtytul = \
 184     handle_poezja_cyt = \
 185     handle_pr = \
 186     handle_pt = \
 187     handle_sekcja_asterysk = \
 188     handle_sekcja_swiatlo = \
 189     handle_separator_linia = \
 190     handle_slowo_obce = \
 191     handle_srodtytul = \
 192     handle_tytul_dziela = \
 193     handle_wyroznienie = \
 194     handle_dywiz = \
 195     handle_texcommand
 196
 197     def handle_naglowek_rozdzial(self, element):
 198         if not self.options['teacher']:
 199             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek', u'Dla ucznia')):
 200                 self.state['mute'] = False
 201             else:
 202                 self.state['mute'] = True
 203                 return None
 204         return self.handle_texcommand(element)
 205     handle_naglowek_rozdzial.unmuter = True
 206
 207     def handle_naglowek_podrozdzial(self, element):
 208         self.activity_counter = 0
 209         if not self.options['teacher']:
 210             if element.text.startswith(u'Dla ucznia'):
 211                 self.state['mute'] = False
 212                 return None
 213             elif element.text.startswith(u'Dla nauczyciela'):
 214                 self.state['mute'] = True
 215                 return None
 216             elif self.state['mute']:
 217                 return None
 218         return self.handle_texcommand(element)
 219     handle_naglowek_podrozdzial.unmuter = True
 220
 221     def handle_uwaga(self, _e):
 222         return None
 223     def handle_extra(self, _e):
 224         return None
 225
 226     def handle_nbsp(self, _e):
 227         return '<spec cat="tilde" />'
 228
 229     _handle_strofa = cmd("strofa")
 230
 231     def handle_strofa(self, element):
 232         self.options = {'strofa': True}
 233         return self._handle_strofa(element)
 234
 235     def handle_aktywnosc(self, element):
 236         self.activity_counter += 1
 237         self.options = {
 238             'activity': True,
 239             'activity_counter': self.activity_counter,
 240             'sub_gen': True,
 241         }
 242         submill = EduModule(self.options, self.state)
 243
 244         if element.xpath('opis'):
 245             opis = submill.generate(element.xpath('opis')[0])
 246         else:
 247             opis = ''
 248
 249         n = element.xpath('wskazowki')
 250         if n: wskazowki = submill.generate(n[0])
 251
 252         else: wskazowki = ''
 253         n = element.xpath('pomoce')
 254
 255         if n: pomoce = submill.generate(n[0])
 256         else: pomoce = ''
 257
 258         forma = ''.join(element.xpath('forma/text()'))
 259
 260         czas = ''.join(element.xpath('czas/text()'))
 261
 262         counter = self.activity_counter
 263
 264         if element.getnext().tag == 'aktywnosc' or (self.activity_last and self.activity_last.getnext() == element):
 265             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 266         else:
 267             counter_tex = ''
 268
 269         self.activity_last = element
 270
 271         return u"""
 272 <cmd name="noindent" />
 273 %(counter_tex)s
 274 <cmd name="activityinfo"><parm>
 275  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 276  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 277  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 278 </parm></cmd>
 279
 280
 281 %(opis)s
 282
 283 %(wskazowki)s
 284 """ % locals()
 285
 286     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 287     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 288
 289     @ifoption(sub_gen=True)
 290     def handle_pomoce(self, _):
 291         return "Pomoce: ", ""
 292
 293     def handle_czas(self, *_):
 294         return
 295
 296     def handle_forma(self, *_):
 297         return
 298
 299     def handle_lista(self, element, attrs={}):
 300         ltype = element.attrib.get('typ', 'punkt')
 301         if not element.findall("punkt"):
 302             if ltype == 'czytelnia':
 303                 return 'W przygotowaniu.'
 304             else:
 305                 return None
 306         if ltype == 'slowniczek':
 307             surl = element.attrib.get('src', None)
 308             if surl is None:
 309                 # print '** missing src on <slowniczek>, setting default'
 310                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 311             sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 312             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 313
 314         listcmd = {'num': 'enumerate',
 315                'punkt': 'itemize',
 316                'alfa': 'itemize',
 317                'slowniczek': 'itemize',
 318                'czytelnia': 'itemize'}[ltype]
 319
 320         return u'<env name="%s">' % listcmd, u'</env>'
 321
 322     def handle_punkt(self, element):
 323         return '<cmd name="item"/>', ''
 324
 325     def handle_cwiczenie(self, element):
 326         exercise_handlers = {
 327             'wybor': Wybor,
 328             'uporzadkuj': Uporzadkuj,
 329             'luki': Luki,
 330             'zastap': Zastap,
 331             'przyporzadkuj': Przyporzadkuj,
 332             'prawdafalsz': PrawdaFalsz
 333         }
 334
 335         typ = element.attrib['typ']
 336         self.exercise_counter += 1
 337         if not typ in exercise_handlers:
 338             return '(no handler)'
 339         self.options = {'exercise_counter': self.exercise_counter}
 340         handler = exercise_handlers[typ](self.options, self.state)
 341         return handler.generate(element)
 342
 343     # XXX this is copied from pyhtml.py, except for return and
 344     # should be refactored for no code duplication
 345     def handle_definiendum(self, element):
 346         nxt = element.getnext()
 347         definiens_s = ''
 348
 349         # let's pull definiens from another document
 350         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 351             sxml = self.options['slowniczek_xml']
 352             assert element.text != ''
 353             if "'" in (element.text or ''):
 354                 defloc = sxml.xpath("//definiendum[text()=\"%s\"]" % (element.text or '').strip())
 355             else:
 356                 defloc = sxml.xpath("//definiendum[text()='%s']" % (element.text or '').strip())
 357             if defloc:
 358                 definiens = defloc[0].getnext()
 359                 if definiens.tag == 'definiens':
 360                     subgen = EduModule(self.options, self.state)
 361                     definiens_s = subgen.generate(definiens)
 362
 363         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 364
 365     def handle_definiens(self, element):
 366         return u"", u""
 367
 368     def handle_podpis(self, element):
 369         return u"""<env name="figure">""", u"</env>"
 370
 371     def handle_tabela(self, element):
 372         max_col = 0
 373         for w in element.xpath("wiersz"):
 374             ks = w.xpath("kol")
 375             if max_col < len(ks):
 376                 max_col = len(ks)
 377         self.options = {'columnts': max_col}
 378         # styling:
 379                 #        has_frames = int(element.attrib.get("ramki", "0"))
 380                 #        if has_frames: frames_c = "framed"
 381                 #        else: frames_c = ""
 382                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 383         return u'''
 384 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 385     ''' % ('l' * max_col), \
 386     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 387
 388     @escape(1)
 389     def handle_wiersz(self, element):
 390         return u"", u'<ctrl ch="\\"/>'
 391
 392     @escape(1)
 393     def handle_kol(self, element):
 394         if element.getnext() is not None:
 395             return u"", u'<spec cat="align" />'
 396         return u"", u""
 397
 398     def handle_link(self, element):
 399         if element.attrib.get('url'):
 400             url = element.attrib.get('url')
 401             if url == element.text:
 402                 return cmd('url')(self, element)
 403             else:
 404                 return cmd('href', parms=[element.attrib['url']])(self, element)
 405         else:
 406             return cmd('emph')(self, element)
 407
 408     def handle_obraz(self, element):
 409         frmt = self.options['format']
 410         name = element.attrib.get('nazwa', '').strip()
 411         image = frmt.get_image(name.strip())
 412         name = image.get_filename().rsplit('/', 1)[-1]
 413         img_path = "obraz/%s" % name.replace("_", "")
 414         frmt.attachments[img_path] = image
 415         return cmd("obraz", parms=[img_path])(self)
 416
 417     def handle_video(self, element):
 418         url = element.attrib.get('url')
 419         if not url:
 420             print '!! <video> missing url'
 421             return
 422         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 423         if not m:
 424             print '!! unknown <video> url scheme:', url
 425             return
 426         name = m.group(1)
 427         thumb = IOFile.from_string(urlopen
 428             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 429         img_path = "video/%s.jpg" % name.replace("_", "")
 430         self.options['format'].attachments[img_path] = thumb
 431         canon_url = "https://www.youtube.com/watch?v=%s" % name
 432         return cmd("video", parms=[img_path, canon_url])(self)
 433
 434
 435 class Exercise(EduModule):
 436     def __init__(self, *args, **kw):
 437         self.question_counter = 0
 438         super(Exercise, self).__init__(*args, **kw)
 439
 440     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 441
 442     def handle_cwiczenie(self, element):
 443         self.options = {
 444             'exercise': element.attrib['typ'],
 445             'sub_gen': True,
 446         }
 447         self.question_counter = 0
 448         self.piece_counter = 0
 449
 450         header = etree.Element("parm")
 451         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 452         header_cmd.append(header)
 453         header.text = u"Zadanie %d." % self.options['exercise_counter']
 454
 455         pre = etree.tostring(header_cmd, encoding=unicode)
 456         post = u""
 457         # Add a single <pytanie> tag if it's not there
 458         if not element.xpath(".//pytanie"):
 459             qpre, qpost = self.handle_pytanie(element)
 460             pre = pre + qpre
 461             post = qpost + post
 462         return pre, post
 463
 464     def handle_pytanie(self, element):
 465         """This will handle <cwiczenie> element, when there is no <pytanie>
 466         """
 467         self.question_counter += 1
 468         self.piece_counter = 0
 469         pre = post = u""
 470         if self.options['teacher'] and element.attrib.get('rozw'):
 471             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 472         return pre, post
 473
 474     def handle_punkt(self, element):
 475         pre, post = super(Exercise, self).handle_punkt(element)
 476         if self.options['teacher'] and element.attrib.get('rozw'):
 477             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 478         return pre, post
 479
 480     def solution_header(self):
 481         par = etree.Element("cmd", name="par")
 482         parm = etree.Element("parm")
 483         parm.text = u"Rozwiązanie:"
 484         par.append(parm)
 485         return etree.tostring(par)
 486
 487     def explicit_solution(self):
 488         if self.options['solution']:
 489             par = etree.Element("cmd", name="par")
 490             parm = etree.Element("parm")
 491             parm.text = self.options['solution']
 492             par.append(parm)
 493             return self.solution_header() + etree.tostring(par)
 494
 495
 496
 497 class Wybor(Exercise):
 498     def handle_cwiczenie(self, element):
 499         pre, post = super(Wybor, self).handle_cwiczenie(element)
 500         is_single_choice = True
 501         pytania = element.xpath(".//pytanie")
 502         if not pytania:
 503             pytania = [element]
 504         for p in pytania:
 505             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 506             if len(solutions) != 1:
 507                 is_single_choice = False
 508                 break
 509             choices = p.xpath(".//*[@nazwa]")
 510             uniq = set()
 511             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 512             if len(choices) != len(uniq):
 513                 is_single_choice = False
 514                 break
 515
 516         self.options = {'single': is_single_choice}
 517         return pre, post
 518
 519     def handle_punkt(self, element):
 520         if self.options['exercise'] and element.attrib.get('nazwa', None):
 521             cmd = 'radio' if self.options['single'] else 'checkbox'
 522             return u'<cmd name="%s"/>' % cmd, ''
 523         else:
 524             return super(Wybor, self).handle_punkt(element)
 525
 526
 527 class Uporzadkuj(Exercise):
 528     def handle_pytanie(self, element):
 529         order_items = element.xpath(".//punkt/@rozw")
 530         return super(Uporzadkuj, self).handle_pytanie(element)
 531
 532
 533 class Przyporzadkuj(Exercise):
 534     def handle_lista(self, lista):
 535         header = etree.Element("parm")
 536         header_cmd = etree.Element("cmd", name="par")
 537         header_cmd.append(header)
 538         if 'nazwa' in lista.attrib:
 539             header.text = u"Kategorie:"
 540         elif 'cel' in lista.attrib:
 541             header.text = u"Elementy do przyporządkowania:"
 542         else:
 543             header.text = u"Lista:"
 544         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 545         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 546         return pre, post
 547
 548
 549 class Luki(Exercise):
 550     def find_pieces(self, question):
 551         return question.xpath(".//luka")
 552
 553     def solution(self, piece):
 554         piece = deepcopy(piece)
 555         piece.tail = None
 556         sub = EduModule()
 557         return sub.generate(piece)
 558
 559     def handle_pytanie(self, element):
 560         qpre, qpost = super(Luki, self).handle_pytanie(element)
 561
 562         luki = self.find_pieces(element)
 563         random.shuffle(luki)
 564         self.words = u"<env name='itemize'>%s</env>" % (
 565             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 566         )
 567         return qpre, qpost
 568
 569     def handle_opis(self, element):
 570         return '', self.words
 571
 572     def handle_luka(self, element):
 573         luka = "_" * 10
 574         if self.options['teacher']:
 575             piece = deepcopy(element)
 576             piece.tail = None
 577             sub = EduModule()
 578             text = sub.generate(piece)
 579             luka += u" [rozwiązanie: %s]" % text
 580         return luka
 581
 582
 583 class Zastap(Luki):
 584     def find_pieces(self, question):
 585         return question.xpath(".//zastap")
 586
 587     def solution(self, piece):
 588         return piece.attrib.get('rozw', '')
 589
 590     def list_header(self):
 591         return u"Elementy do wstawienia"
 592
 593     def handle_zastap(self, element):
 594         piece = deepcopy(element)
 595         piece.tail = None
 596         sub = EduModule()
 597         text = sub.generate(piece)
 598         if self.options['teacher'] and element.attrib.get('rozw'):
 599             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 600         return text
 601
 602
 603 class PrawdaFalsz(Exercise):
 604     def handle_punkt(self, element):
 605         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 606         if 'rozw' in element.attrib:
 607             post += u" [Prawda/Fałsz]"
 608         return pre, post
 609
 610
 611
 612 def fix_lists(tree):
 613     lists = tree.xpath(".//lista")
 614     for l in lists:
 615         if l.text:
 616             p = l.getprevious()
 617             if p is not None:
 618                 if p.tail is None: p.tail = ''
 619                 p.tail += l.text
 620             else:
 621                 p = l.getparent()
 622                 if p.text is None: p.text = ''
 623                 p.text += l.text
 624             l.text = ''
 625     return tree
 626
 627
 628 class EduModulePDFFormat(PDFFormat):
 629     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 630
 631     def get_texml(self):
 632         substitute_hyphens(self.wldoc.edoc)
 633         fix_hanging(self.wldoc.edoc)
 634
 635         self.attachments = {}
 636         edumod = EduModule({
 637             "wldoc": self.wldoc,
 638             "format": self,
 639             "teacher": self.customization.get('teacher'),
 640         })
 641         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 642
 643         open("/tmp/texml.xml", "w").write(texml)
 644         return texml
 645
 646     def get_tex_dir(self):
 647         temp = super(EduModulePDFFormat, self).get_tex_dir()
 648         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 649         for name, iofile in self.attachments.items():
 650             iofile.save_as(os.path.join(temp, name))
 651         return temp
 652
 653     def get_image(self, name):
 654         return self.wldoc.source.attachments[name]
 655