librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         "skip metadata in generation"
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + \
  97           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  98           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  99           u'</cmd>'
 100
 101     @escape(True)
 102     def get_authors(self, element, which=None):
 103         dc = self.options['wldoc'].book_info
 104         if which is None:
 105             authors = dc.authors_textbook + \
 106                 dc.authors_scenario + \
 107                 dc.authors_expert
 108         else:
 109             authors = getattr(dc, "authors_%s" % which)
 110         return u', '.join(author.readable() for author in authors)
 111
 112     @escape(1)
 113     def get_title(self, element):
 114         return self.get_dc(element, 'title', True)
 115
 116     def handle_utwor(self, element):
 117         lines = [
 118             u'''
 119     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 120         <TeXML escape="0">
 121         \\documentclass[%s]{wl}
 122         \\usepackage{style}''' % self.options['customization_str'],
 123     self.options['has_cover'] and '\usepackage{makecover}',
 124     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 125     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 126     (self.options['morefloats'] == 'none' and
 127      u'''\\IfFileExists{morefloats.sty}{
 128             \\usepackage{morefloats}
 129         }{}'''),
 130     u'''\\def\\authors{%s}''' % self.get_authors(element),
 131     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 132     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 133     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 134
 135     u'''\\author{\\authors}''',
 136     u'''\\title{%s}''' % self.get_title(element),
 137     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 138     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 139     u'</TeXML>']
 140
 141         return u"".join(filter(None, lines)), u'</TeXML>'
 142
 143
 144     @escape(1)
 145     def handle_powiesc(self, element):
 146         return u"""
 147     <env name="document">
 148     <cmd name="maketitle"/>
 149     """, """<cmd name="editorialsection" /></env>"""
 150
 151     @escape(1)
 152     def handle_texcommand(self, element):
 153         cmd = functions.texcommand(element.tag)
 154         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 155
 156     handle_akap = \
 157     handle_akap = \
 158     handle_akap_cd = \
 159     handle_akap_cd = \
 160     handle_akap_dialog = \
 161     handle_akap_dialog = \
 162     handle_autor_utworu = \
 163     handle_dedykacja = \
 164     handle_didaskalia = \
 165     handle_didask_tekst = \
 166     handle_dlugi_cytat = \
 167     handle_dzielo_nadrzedne = \
 168     handle_lista_osoba = \
 169     handle_mat = \
 170     handle_miejsce_czas = \
 171     handle_motto = \
 172     handle_motto_podpis = \
 173     handle_naglowek_akt = \
 174     handle_naglowek_czesc = \
 175     handle_naglowek_listy = \
 176     handle_naglowek_osoba = \
 177     handle_naglowek_scena = \
 178     handle_nazwa_utworu = \
 179     handle_nota = \
 180     handle_osoba = \
 181     handle_pa = \
 182     handle_pe = \
 183     handle_podtytul = \
 184     handle_poezja_cyt = \
 185     handle_pr = \
 186     handle_pt = \
 187     handle_sekcja_asterysk = \
 188     handle_sekcja_swiatlo = \
 189     handle_separator_linia = \
 190     handle_slowo_obce = \
 191     handle_srodtytul = \
 192     handle_tytul_dziela = \
 193     handle_wyroznienie = \
 194     handle_dywiz = \
 195     handle_texcommand
 196
 197     def handle_naglowek_rozdzial(self, element):
 198         if not self.options['teacher']:
 199             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek')):
 200                 self.state['mute'] = False
 201             else:
 202                 self.state['mute'] = True
 203                 return None
 204         return self.handle_texcommand(element)
 205     handle_naglowek_rozdzial.unmuter = True
 206
 207     def handle_naglowek_podrozdzial(self, element):
 208         self.activity_counter = 0
 209         return self.handle_texcommand(element)
 210
 211     def handle_uwaga(self, _e):
 212         return None
 213     def handle_extra(self, _e):
 214         return None
 215
 216     def handle_nbsp(self, _e):
 217         return '<spec cat="tilde" />'
 218
 219     _handle_strofa = cmd("strofa")
 220
 221     def handle_strofa(self, element):
 222         self.options = {'strofa': True}
 223         return self._handle_strofa(element)
 224
 225     def handle_aktywnosc(self, element):
 226         self.activity_counter += 1
 227         self.options = {
 228             'activity': True,
 229             'activity_counter': self.activity_counter,
 230             'sub_gen': True,
 231         }
 232         submill = EduModule(self.options, self.state)
 233
 234         if element.xpath('opis'):
 235             opis = submill.generate(element.xpath('opis')[0])
 236         else:
 237             opis = ''
 238
 239         n = element.xpath('wskazowki')
 240         if n: wskazowki = submill.generate(n[0])
 241
 242         else: wskazowki = ''
 243         n = element.xpath('pomoce')
 244
 245         if n: pomoce = submill.generate(n[0])
 246         else: pomoce = ''
 247
 248         forma = ''.join(element.xpath('forma/text()'))
 249
 250         czas = ''.join(element.xpath('czas/text()'))
 251
 252         counter = self.activity_counter
 253
 254         if element.getnext().tag == 'aktywnosc' or self.activity_last.getnext() == element:
 255             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 256         else:
 257             counter_tex = ''
 258
 259         self.activity_last = element
 260
 261         return u"""
 262 <cmd name="noindent" />
 263 %(counter_tex)s
 264 <cmd name="activityinfo"><parm>
 265  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 266  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 267  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 268 </parm></cmd>
 269
 270
 271 %(opis)s
 272
 273 %(wskazowki)s
 274 """ % locals()
 275
 276     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 277     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 278
 279     @ifoption(sub_gen=True)
 280     def handle_pomoce(self, _):
 281         return "Pomoce: ", ""
 282
 283     def handle_czas(self, *_):
 284         return
 285
 286     def handle_forma(self, *_):
 287         return
 288
 289     def handle_lista(self, element, attrs={}):
 290         ltype = element.attrib.get('typ', 'punkt')
 291         if not element.findall("punkt"):
 292             if ltype == 'czytelnia':
 293                 return 'W przygotowaniu.'
 294             else:
 295                 return None
 296         if ltype == 'slowniczek':
 297             surl = element.attrib.get('src', None)
 298             if surl is None:
 299                 # print '** missing src on <slowniczek>, setting default'
 300                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 301             sxml = None
 302             if surl:
 303                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 304             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 305
 306         listcmd = {'num': 'enumerate',
 307                'punkt': 'itemize',
 308                'alfa': 'itemize',
 309                'slowniczek': 'itemize',
 310                'czytelnia': 'itemize'}[ltype]
 311
 312         return u'<env name="%s">' % listcmd, u'</env>'
 313
 314     def handle_punkt(self, element):
 315         return '<cmd name="item"/>', ''
 316
 317     def handle_cwiczenie(self, element):
 318         exercise_handlers = {
 319             'wybor': Wybor,
 320             'uporzadkuj': Uporzadkuj,
 321             'luki': Luki,
 322             'zastap': Zastap,
 323             'przyporzadkuj': Przyporzadkuj,
 324             'prawdafalsz': PrawdaFalsz
 325         }
 326
 327         typ = element.attrib['typ']
 328         self.exercise_counter += 1
 329         if not typ in exercise_handlers:
 330             return '(no handler)'
 331         self.options = {'exercise_counter': self.exercise_counter}
 332         handler = exercise_handlers[typ](self.options, self.state)
 333         return handler.generate(element)
 334
 335     # XXX this is copied from pyhtml.py, except for return and
 336     # should be refactored for no code duplication
 337     def handle_definiendum(self, element):
 338         nxt = element.getnext()
 339         definiens_s = ''
 340
 341         # let's pull definiens from another document
 342         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 343             sxml = self.options['slowniczek_xml']
 344             assert element.text != ''
 345             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 346             if defloc:
 347                 definiens = defloc[0].getnext()
 348                 if definiens.tag == 'definiens':
 349                     subgen = EduModule(self.options, self.state)
 350                     definiens_s = subgen.generate(definiens)
 351
 352         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 353
 354     def handle_definiens(self, element):
 355         return u"", u""
 356
 357     def handle_podpis(self, element):
 358         return u"""<env name="figure">""", u"</env>"
 359
 360     def handle_tabela(self, element):
 361         max_col = 0
 362         for w in element.xpath("wiersz"):
 363             ks = w.xpath("kol")
 364             if max_col < len(ks):
 365                 max_col = len(ks)
 366         self.options = {'columnts': max_col}
 367         # styling:
 368                 #        has_frames = int(element.attrib.get("ramki", "0"))
 369                 #        if has_frames: frames_c = "framed"
 370                 #        else: frames_c = ""
 371                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 372         return u'''
 373 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 374     ''' % ('l' * max_col), \
 375     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 376
 377     @escape(1)
 378     def handle_wiersz(self, element):
 379         return u"", u'<ctrl ch="\\"/>'
 380
 381     @escape(1)
 382     def handle_kol(self, element):
 383         if element.getnext() is not None:
 384             return u"", u'<spec cat="align" />'
 385         return u"", u""
 386
 387     def handle_link(self, element):
 388         if element.attrib.get('url'):
 389             url = element.attrib.get('url')
 390             if url == element.text:
 391                 return cmd('url')(self, element)
 392             else:
 393                 return cmd('href', parms=[element.attrib['url']])(self, element)
 394         else:
 395             return cmd('emph')(self, element)
 396
 397     def handle_obraz(self, element):
 398         frmt = self.options['format']
 399         name = element.attrib.get('nazwa', '').strip()
 400         image = frmt.get_image(name.strip())
 401         img_path = "obraz/%s" % name.replace("_", "")
 402         frmt.attachments[img_path] = image
 403         return cmd("obraz", parms=[img_path])(self)
 404
 405     def handle_video(self, element):
 406         url = element.attrib.get('url')
 407         if not url:
 408             print '!! <video> missing url'
 409             return
 410         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 411         if not m:
 412             print '!! unknown <video> url scheme:', url
 413             return
 414         name = m.group(1)
 415         thumb = IOFile.from_string(urlopen
 416             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 417         img_path = "video/%s.jpg" % name.replace("_", "")
 418         self.options['format'].attachments[img_path] = thumb
 419         canon_url = "https://www.youtube.com/watch?v=%s" % name
 420         return cmd("video", parms=[img_path, canon_url])(self)
 421
 422
 423 class Exercise(EduModule):
 424     def __init__(self, *args, **kw):
 425         self.question_counter = 0
 426         super(Exercise, self).__init__(*args, **kw)
 427
 428     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 429
 430     def handle_cwiczenie(self, element):
 431         self.options = {
 432             'exercise': element.attrib['typ'],
 433             'sub_gen': True,
 434         }
 435         self.question_counter = 0
 436         self.piece_counter = 0
 437
 438         header = etree.Element("parm")
 439         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 440         header_cmd.append(header)
 441         header.text = u"Zadanie %d." % self.options['exercise_counter']
 442
 443         pre = etree.tostring(header_cmd, encoding=unicode)
 444         post = u""
 445         # Add a single <pytanie> tag if it's not there
 446         if not element.xpath(".//pytanie"):
 447             qpre, qpost = self.handle_pytanie(element)
 448             pre = pre + qpre
 449             post = qpost + post
 450         return pre, post
 451
 452     def handle_pytanie(self, element):
 453         """This will handle <cwiczenie> element, when there is no <pytanie>
 454         """
 455         self.question_counter += 1
 456         self.piece_counter = 0
 457         pre = post = u""
 458         if self.options['teacher'] and element.attrib.get('rozw'):
 459             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 460         return pre, post
 461
 462     def handle_punkt(self, element):
 463         pre, post = super(Exercise, self).handle_punkt(element)
 464         if self.options['teacher'] and element.attrib.get('rozw'):
 465             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 466         return pre, post
 467
 468     def solution_header(self):
 469         par = etree.Element("cmd", name="par")
 470         parm = etree.Element("parm")
 471         parm.text = u"Rozwiązanie:"
 472         par.append(parm)
 473         return etree.tostring(par)
 474
 475     def explicit_solution(self):
 476         if self.options['solution']:
 477             par = etree.Element("cmd", name="par")
 478             parm = etree.Element("parm")
 479             parm.text = self.options['solution']
 480             par.append(parm)
 481             return self.solution_header() + etree.tostring(par)
 482
 483
 484
 485 class Wybor(Exercise):
 486     def handle_cwiczenie(self, element):
 487         pre, post = super(Wybor, self).handle_cwiczenie(element)
 488         is_single_choice = True
 489         pytania = element.xpath(".//pytanie")
 490         if not pytania:
 491             pytania = [element]
 492         for p in pytania:
 493             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 494             if len(solutions) != 1:
 495                 is_single_choice = False
 496                 break
 497             choices = p.xpath(".//*[@nazwa]")
 498             uniq = set()
 499             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 500             if len(choices) != len(uniq):
 501                 is_single_choice = False
 502                 break
 503
 504         self.options = {'single': is_single_choice}
 505         return pre, post
 506
 507     def handle_punkt(self, element):
 508         if self.options['exercise'] and element.attrib.get('nazwa', None):
 509             cmd = 'radio' if self.options['single'] else 'checkbox'
 510             return u'<cmd name="%s"/>' % cmd, ''
 511         else:
 512             return super(Wybor, self).handle_punkt(element)
 513
 514
 515 class Uporzadkuj(Exercise):
 516     def handle_pytanie(self, element):
 517         order_items = element.xpath(".//punkt/@rozw")
 518         return super(Uporzadkuj, self).handle_pytanie(element)
 519
 520
 521 class Przyporzadkuj(Exercise):
 522     def handle_lista(self, lista):
 523         header = etree.Element("parm")
 524         header_cmd = etree.Element("cmd", name="par")
 525         header_cmd.append(header)
 526         if 'nazwa' in lista.attrib:
 527             header.text = u"Kategorie:"
 528         elif 'cel' in lista.attrib:
 529             header.text = u"Elementy do przyporządkowania:"
 530         else:
 531             header.text = u"Lista:"
 532         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 533         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 534         return pre, post
 535
 536
 537 class Luki(Exercise):
 538     def find_pieces(self, question):
 539         return question.xpath(".//luka")
 540
 541     def solution(self, piece):
 542         piece = deepcopy(piece)
 543         piece.tail = None
 544         sub = EduModule()
 545         return sub.generate(piece)
 546
 547     def handle_pytanie(self, element):
 548         qpre, qpost = super(Luki, self).handle_pytanie(element)
 549
 550         luki = self.find_pieces(element)
 551         random.shuffle(luki)
 552         self.words = u"<env name='itemize'>%s</env>" % (
 553             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 554         )
 555         return qpre, qpost
 556
 557     def handle_opis(self, element):
 558         return '', self.words
 559
 560     def handle_luka(self, element):
 561         luka = "_" * 10
 562         if self.options['teacher']:
 563             piece = deepcopy(element)
 564             piece.tail = None
 565             sub = EduModule()
 566             text = sub.generate(piece)
 567             luka += u" [rozwiązanie: %s]" % text
 568         return luka
 569
 570
 571 class Zastap(Luki):
 572     def find_pieces(self, question):
 573         return question.xpath(".//zastap")
 574
 575     def solution(self, piece):
 576         return piece.attrib.get('rozw', '')
 577
 578     def list_header(self):
 579         return u"Elementy do wstawienia"
 580
 581     def handle_zastap(self, element):
 582         piece = deepcopy(element)
 583         piece.tail = None
 584         sub = EduModule()
 585         text = sub.generate(piece)
 586         if self.options['teacher'] and element.attrib.get('rozw'):
 587             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 588         return text
 589
 590
 591 class PrawdaFalsz(Exercise):
 592     def handle_punkt(self, element):
 593         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 594         if 'rozw' in element.attrib:
 595             post += u" [Prawda/Fałsz]"
 596         return pre, post
 597
 598
 599
 600 def fix_lists(tree):
 601     lists = tree.xpath(".//lista")
 602     for l in lists:
 603         if l.text:
 604             p = l.getprevious()
 605             if p is not None:
 606                 if p.tail is None: p.tail = ''
 607                 p.tail += l.text
 608             else:
 609                 p = l.getparent()
 610                 if p.text is None: p.text = ''
 611                 p.text += l.text
 612             l.text = ''
 613     return tree
 614
 615
 616 class EduModulePDFFormat(PDFFormat):
 617     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 618
 619     def get_texml(self):
 620         substitute_hyphens(self.wldoc.edoc)
 621         fix_hanging(self.wldoc.edoc)
 622
 623         self.attachments = {}
 624         edumod = EduModule({
 625             "wldoc": self.wldoc,
 626             "format": self,
 627             "teacher": self.customization.get('teacher'),
 628         })
 629         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 630
 631         open("/tmp/texml.xml", "w").write(texml)
 632         return texml
 633
 634     def get_tex_dir(self):
 635         temp = super(EduModulePDFFormat, self).get_tex_dir()
 636         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 637         for name, iofile in self.attachments.items():
 638             iofile.save_as(os.path.join(temp, name))
 639         return temp
 640
 641     def get_image(self, name):
 642         return self.wldoc.source.attachments[name]
 643