librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, ifoption, tag_open_close
  22 from librarian import DCNS, get_resource, IOFile
  23 from librarian import functions
  24 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  25
  26
  27 def escape(really):
  28     def deco(f):
  29         def _wrap(*args, **kw):
  30             value = f(*args, **kw)
  31
  32             prefix = (u'<TeXML escape="%d">' % (1 if really else 0))
  33             postfix = u'</TeXML>'
  34             if isinstance(value, list):
  35                 import pdb
  36                 pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0] if len(values) else ''
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         """skip metadata in generation"""
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + (rights_lic and u'<opt>%s</opt>' % rights_lic or '') + \
  97             u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) + \
  98             u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors if author)
 110
 111     @escape(True)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
 115     @escape(True)
 116     def get_description(self, element):
 117         desc = self.get_dc(element, 'description', single=True)
 118         if not desc:
 119             print '!! no descripton'
 120         return desc
 121
 122     def handle_utwor(self, element):
 123         lines = [
 124             u'''
 125                 <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 126                 <TeXML escape="0">
 127                 \\documentclass[%s]{wl}
 128                 \\usepackage{style}''' % self.options['customization_str'],
 129             self.options['has_cover'] and '\usepackage{makecover}',
 130             (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 131             (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 132             (self.options['morefloats'] == 'none' and
 133                 u'''\\IfFileExists{morefloats.sty}{
 134                 \\usepackage{morefloats}
 135                 }{}'''),
 136             u'''\\def\\authors{%s}''' % self.get_authors(element),
 137             u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 138             u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 139             u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 140             u'''\\def\\description{%s}''' % self.get_description(element),
 141
 142             u'''\\author{\\authors}''',
 143             u'''\\title{%s}''' % self.get_title(element),
 144             u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 145             u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 146             u'</TeXML>'
 147         ]
 148
 149         return u"".join(filter(None, lines)), u'</TeXML>'
 150
 151     @escape(True)
 152     def handle_powiesc(self, element):
 153         return u"""
 154     <env name="document">
 155     <cmd name="maketitle"/>
 156     """, """<cmd name="editorialsection" /></env>"""
 157
 158     @escape(True)
 159     def handle_texcommand(self, element):
 160         cmd = functions.texcommand(element.tag)
 161         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 162
 163     handle_akap = \
 164         handle_akap_cd = \
 165         handle_akap_dialog = \
 166         handle_autor_utworu = \
 167         handle_dedykacja = \
 168         handle_didaskalia = \
 169         handle_didask_tekst = \
 170         handle_dlugi_cytat = \
 171         handle_dzielo_nadrzedne = \
 172         handle_lista_osoba = \
 173         handle_mat = \
 174         handle_miejsce_czas = \
 175         handle_motto = \
 176         handle_motto_podpis = \
 177         handle_naglowek_akt = \
 178         handle_naglowek_czesc = \
 179         handle_naglowek_listy = \
 180         handle_naglowek_osoba = \
 181         handle_naglowek_scena = \
 182         handle_nazwa_utworu = \
 183         handle_nota = \
 184         handle_osoba = \
 185         handle_pa = \
 186         handle_pe = \
 187         handle_podtytul = \
 188         handle_poezja_cyt = \
 189         handle_pr = \
 190         handle_pt = \
 191         handle_sekcja_asterysk = \
 192         handle_sekcja_swiatlo = \
 193         handle_separator_linia = \
 194         handle_slowo_obce = \
 195         handle_srodtytul = \
 196         handle_tytul_dziela = \
 197         handle_wyroznienie = \
 198         handle_dywiz = \
 199         handle_texcommand
 200
 201     def handle_naglowek_rozdzial(self, element):
 202         if not self.options['teacher']:
 203             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek', u'Dla ucznia')):
 204                 self.state['mute'] = False
 205             else:
 206                 self.state['mute'] = True
 207                 return None
 208         return self.handle_texcommand(element)
 209     handle_naglowek_rozdzial.unmuter = True
 210
 211     def handle_naglowek_podrozdzial(self, element):
 212         self.activity_counter = 0
 213         if not self.options['teacher']:
 214             if element.text.startswith(u'Dla ucznia'):
 215                 self.state['mute'] = False
 216                 return None
 217             elif element.text.startswith(u'Dla nauczyciela'):
 218                 self.state['mute'] = True
 219                 return None
 220             elif self.state['mute']:
 221                 return None
 222         return self.handle_texcommand(element)
 223     handle_naglowek_podrozdzial.unmuter = True
 224
 225     def handle_uwaga(self, _e):
 226         return None
 227
 228     def handle_extra(self, _e):
 229         return None
 230
 231     def handle_nbsp(self, _e):
 232         return '<spec cat="tilde" />'
 233
 234     _handle_strofa = cmd("strofa")
 235
 236     def handle_strofa(self, element):
 237         self.options = {'strofa': True}
 238         return self._handle_strofa(element)
 239
 240     def handle_aktywnosc(self, element):
 241         self.activity_counter += 1
 242         self.options = {
 243             'activity': True,
 244             'activity_counter': self.activity_counter,
 245             'sub_gen': True,
 246         }
 247         submill = EduModule(self.options, self.state)
 248
 249         if element.xpath('opis'):
 250             opis = submill.generate(element.xpath('opis')[0])
 251         else:
 252             opis = ''
 253
 254         n = element.xpath('wskazowki')
 255         if n:
 256             wskazowki = submill.generate(n[0])
 257         else:
 258             wskazowki = ''
 259         n = element.xpath('pomoce')
 260
 261         if n:
 262             pomoce = submill.generate(n[0])
 263         else:
 264             pomoce = ''
 265
 266         forma = ''.join(element.xpath('forma/text()'))
 267
 268         czas = ''.join(element.xpath('czas/text()'))
 269
 270         counter = self.activity_counter
 271
 272         if element.getnext().tag == 'aktywnosc' or (len(self.activity_last) and self.activity_last.getnext() == element):
 273             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 274         else:
 275             counter_tex = ''
 276
 277         self.activity_last = element
 278
 279         return u"""
 280 <cmd name="noindent" />
 281 %(counter_tex)s
 282 <cmd name="activityinfo"><parm>
 283  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 284  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 285  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 286 </parm></cmd>
 287
 288
 289 %(opis)s
 290
 291 %(wskazowki)s
 292 """ % locals()
 293
 294     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 295     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 296
 297     @ifoption(sub_gen=True)
 298     def handle_pomoce(self, _):
 299         return "Pomoce: ", ""
 300
 301     def handle_czas(self, *_):
 302         return
 303
 304     def handle_forma(self, *_):
 305         return
 306
 307     def handle_lista(self, element, attrs=None):
 308         ltype = element.attrib.get('typ', 'punkt')
 309         if not element.findall("punkt"):
 310             if ltype == 'czytelnia':
 311                 return 'W przygotowaniu.'
 312             else:
 313                 return None
 314         if ltype == 'slowniczek':
 315             surl = element.attrib.get('src', None)
 316             if surl is None:
 317                 # print '** missing src on <slowniczek>, setting default'
 318                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 319             sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 320             self.options = {'slowniczek': True, 'slowniczek_xml': sxml}
 321
 322         listcmd = {
 323             'num': 'enumerate',
 324             'punkt': 'itemize',
 325             'alfa': 'itemize',
 326             'slowniczek': 'itemize',
 327             'czytelnia': 'itemize'
 328         }[ltype]
 329
 330         return u'<env name="%s">' % listcmd, u'</env>'
 331
 332     def handle_punkt(self, element):
 333         return '<cmd name="item"/>', ''
 334
 335     def handle_cwiczenie(self, element):
 336         exercise_handlers = {
 337             'wybor': Wybor,
 338             'uporzadkuj': Uporzadkuj,
 339             'luki': Luki,
 340             'zastap': Zastap,
 341             'przyporzadkuj': Przyporzadkuj,
 342             'prawdafalsz': PrawdaFalsz
 343         }
 344
 345         typ = element.attrib['typ']
 346         self.exercise_counter += 1
 347         if typ not in exercise_handlers:
 348             return '(no handler)'
 349         self.options = {'exercise_counter': self.exercise_counter}
 350         handler = exercise_handlers[typ](self.options, self.state)
 351         return handler.generate(element)
 352
 353     # XXX this is copied from pyhtml.py, except for return and
 354     # should be refactored for no code duplication
 355     def handle_definiendum(self, element):
 356         nxt = element.getnext()
 357         definiens_s = ''
 358
 359         # let's pull definiens from another document
 360         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 361             sxml = self.options['slowniczek_xml']
 362             assert element.text != ''
 363             if "'" in (element.text or ''):
 364                 defloc = sxml.xpath("//definiendum[text()=\"%s\"]" % (element.text or '').strip())
 365             else:
 366                 defloc = sxml.xpath("//definiendum[text()='%s']" % (element.text or '').strip())
 367             if defloc:
 368                 definiens = defloc[0].getnext()
 369                 if definiens.tag == 'definiens':
 370                     subgen = EduModule(self.options, self.state)
 371                     definiens_s = subgen.generate(definiens)
 372
 373         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 374
 375     def handle_definiens(self, element):
 376         return u"", u""
 377
 378     def handle_podpis(self, element):
 379         return u"""<env name="figure">""", u"</env>"
 380
 381     def handle_tabela(self, element):
 382         max_col = 0
 383         for w in element.xpath("wiersz"):
 384             ks = w.xpath("kol")
 385             if max_col < len(ks):
 386                 max_col = len(ks)
 387         self.options = {'columnts': max_col}
 388         # styling:
 389         #     has_frames = int(element.attrib.get("ramki", "0"))
 390         #     if has_frames: frames_c = "framed"
 391         #     else: frames_c = ""
 392         #     return u"""<table class="%s">""" % frames_c, u"</table>"
 393         return u'''
 394 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 395     ''' % ('l' * max_col), u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 396
 397     @escape(True)
 398     def handle_wiersz(self, element):
 399         return u"", u'<ctrl ch="\\"/>'
 400
 401     @escape(True)
 402     def handle_kol(self, element):
 403         if element.getnext() is not None:
 404             return u"", u'<spec cat="align" />'
 405         return u"", u""
 406
 407     def handle_link(self, element):
 408         if element.attrib.get('url'):
 409             url = element.attrib.get('url')
 410             if url == element.text:
 411                 return cmd('url')(self, element)
 412             else:
 413                 return cmd('href', parms=[element.attrib['url']])(self, element)
 414         else:
 415             return cmd('emph')(self, element)
 416
 417     def handle_obraz(self, element):
 418         frmt = self.options['format']
 419         name = element.attrib.get('nazwa', '').strip()
 420         image = frmt.get_image(name.strip())
 421         name = image.get_filename().rsplit('/', 1)[-1]
 422         img_path = "obraz/%s" % name.replace("_", "")
 423         frmt.attachments[img_path] = image
 424         return cmd("obraz", parms=[img_path])(self)
 425
 426     def handle_video(self, element):
 427         url = element.attrib.get('url')
 428         if not url:
 429             print '!! <video> missing url'
 430             return
 431         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 432         if not m:
 433             print '!! unknown <video> url scheme:', url
 434             return
 435         name = m.group(1)
 436         thumb = IOFile.from_string(urlopen("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 437         img_path = "video/%s.jpg" % name.replace("_", "")
 438         self.options['format'].attachments[img_path] = thumb
 439         canon_url = "https://www.youtube.com/watch?v=%s" % name
 440         return cmd("video", parms=[img_path, canon_url])(self)
 441
 442
 443 class Exercise(EduModule):
 444     def __init__(self, *args, **kw):
 445         self.question_counter = 0
 446         super(Exercise, self).__init__(*args, **kw)
 447         self.piece_counter = None
 448
 449     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 450
 451     def handle_cwiczenie(self, element):
 452         self.options = {
 453             'exercise': element.attrib['typ'],
 454             'sub_gen': True,
 455         }
 456         self.question_counter = 0
 457         self.piece_counter = 0
 458
 459         header = etree.Element("parm")
 460         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 461         header_cmd.append(header)
 462         header.text = u"Zadanie %d." % self.options['exercise_counter']
 463
 464         pre = etree.tostring(header_cmd, encoding=unicode)
 465         post = u""
 466         # Add a single <pytanie> tag if it's not there
 467         if not element.xpath(".//pytanie"):
 468             qpre, qpost = self.handle_pytanie(element)
 469             pre += qpre
 470             post = qpost + post
 471         return pre, post
 472
 473     def handle_pytanie(self, element):
 474         """This will handle <cwiczenie> element, when there is no <pytanie>
 475         """
 476         self.question_counter += 1
 477         self.piece_counter = 0
 478         pre = post = u""
 479         if self.options['teacher'] and element.attrib.get('rozw'):
 480             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 481         return pre, post
 482
 483     def handle_punkt(self, element):
 484         pre, post = super(Exercise, self).handle_punkt(element)
 485         if self.options['teacher'] and element.attrib.get('rozw'):
 486             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 487         return pre, post
 488
 489     def solution_header(self):
 490         par = etree.Element("cmd", name="par")
 491         parm = etree.Element("parm")
 492         parm.text = u"Rozwiązanie:"
 493         par.append(parm)
 494         return etree.tostring(par)
 495
 496     def explicit_solution(self):
 497         if self.options['solution']:
 498             par = etree.Element("cmd", name="par")
 499             parm = etree.Element("parm")
 500             parm.text = self.options['solution']
 501             par.append(parm)
 502             return self.solution_header() + etree.tostring(par)
 503
 504
 505 class Wybor(Exercise):
 506     def handle_cwiczenie(self, element):
 507         pre, post = super(Wybor, self).handle_cwiczenie(element)
 508         is_single_choice = True
 509         pytania = element.xpath(".//pytanie")
 510         if not pytania:
 511             pytania = [element]
 512         for p in pytania:
 513             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 514             if len(solutions) != 1:
 515                 is_single_choice = False
 516                 break
 517             choices = p.xpath(".//*[@nazwa]")
 518             uniq = set()
 519             for n in choices:
 520                 uniq.add(n.attrib.get('nazwa', ''))
 521             if len(choices) != len(uniq):
 522                 is_single_choice = False
 523                 break
 524
 525         self.options = {'single': is_single_choice}
 526         return pre, post
 527
 528     def handle_punkt(self, element):
 529         if self.options['exercise'] and element.attrib.get('nazwa', None):
 530             cmd = 'radio' if self.options['single'] else 'checkbox'
 531             return u'<cmd name="%s"/>' % cmd, ''
 532         else:
 533             return super(Wybor, self).handle_punkt(element)
 534
 535
 536 class Uporzadkuj(Exercise):
 537     def handle_pytanie(self, element):
 538         order_items = element.xpath(".//punkt/@rozw")
 539         return super(Uporzadkuj, self).handle_pytanie(element)
 540
 541
 542 class Przyporzadkuj(Exercise):
 543     def handle_lista(self, lista):
 544         header = etree.Element("parm")
 545         header_cmd = etree.Element("cmd", name="par")
 546         header_cmd.append(header)
 547         if 'nazwa' in lista.attrib:
 548             header.text = u"Kategorie:"
 549         elif 'cel' in lista.attrib:
 550             header.text = u"Elementy do przyporządkowania:"
 551         else:
 552             header.text = u"Lista:"
 553         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 554         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 555         return pre, post
 556
 557
 558 class Luki(Exercise):
 559     def find_pieces(self, question):
 560         return question.xpath(".//luka")
 561
 562     def solution(self, piece):
 563         piece = deepcopy(piece)
 564         piece.tail = None
 565         sub = EduModule()
 566         return sub.generate(piece)
 567
 568     def handle_pytanie(self, element):
 569         qpre, qpost = super(Luki, self).handle_pytanie(element)
 570
 571         luki = self.find_pieces(element)
 572         random.shuffle(luki)
 573         self.words = u"<env name='itemize'>%s</env>" % (
 574             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 575         )
 576         return qpre, qpost
 577
 578     def handle_opis(self, element):
 579         return '', self.words
 580
 581     def handle_luka(self, element):
 582         luka = "_" * 10
 583         if self.options['teacher']:
 584             piece = deepcopy(element)
 585             piece.tail = None
 586             sub = EduModule()
 587             text = sub.generate(piece)
 588             luka += u" [rozwiązanie: %s]" % text
 589         return luka
 590
 591
 592 class Zastap(Luki):
 593     def find_pieces(self, question):
 594         return question.xpath(".//zastap")
 595
 596     def solution(self, piece):
 597         return piece.attrib.get('rozw', '')
 598
 599     def list_header(self):
 600         return u"Elementy do wstawienia"
 601
 602     def handle_zastap(self, element):
 603         piece = deepcopy(element)
 604         piece.tail = None
 605         sub = EduModule()
 606         text = sub.generate(piece)
 607         if self.options['teacher'] and element.attrib.get('rozw'):
 608             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 609         return text
 610
 611
 612 class PrawdaFalsz(Exercise):
 613     def handle_punkt(self, element):
 614         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 615         if 'rozw' in element.attrib:
 616             post += u" [Prawda/Fałsz]"
 617         return pre, post
 618
 619
 620 def fix_lists(tree):
 621     lists = tree.xpath(".//lista")
 622     for l in lists:
 623         if l.text:
 624             p = l.getprevious()
 625             if p is not None:
 626                 if p.tail is None:
 627                     p.tail = ''
 628                 p.tail += l.text
 629             else:
 630                 p = l.getparent()
 631                 if p.text is None:
 632                     p.text = ''
 633                 p.text += l.text
 634             l.text = ''
 635     return tree
 636
 637
 638 class EduModulePDFFormat(PDFFormat):
 639     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 640
 641     def get_texml(self):
 642         substitute_hyphens(self.wldoc.edoc)
 643         fix_hanging(self.wldoc.edoc)
 644
 645         self.attachments = {}
 646         edumod = EduModule({
 647             "wldoc": self.wldoc,
 648             "format": self,
 649             "teacher": self.customization.get('teacher'),
 650         })
 651         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 652
 653         open("/tmp/texml.xml", "w").write(texml)
 654         return texml
 655
 656     def get_tex_dir(self):
 657         temp = super(EduModulePDFFormat, self).get_tex_dir()
 658         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 659         for name, iofile in self.attachments.items():
 660             iofile.save_as(os.path.join(temp, name))
 661         return temp
 662
 663     def get_image(self, name):
 664         return self.wldoc.source.attachments[name]