X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/773612b400fb85103153032b193b1434d136a7ef..565aa32ff31bdbe6e2aaf9f47b124ac1f883a990:/librarian/pypdf.py diff --git a/librarian/pypdf.py b/librarian/pypdf.py index 4cc4d1d..9851cb1 100644 --- a/librarian/pypdf.py +++ b/librarian/pypdf.py @@ -9,27 +9,19 @@ Creates one big XML from the book and its children, converts it to LaTeX with TeXML, then runs it by XeLaTeX. """ -from __future__ import with_statement -import os +from copy import deepcopy import os.path import shutil -from StringIO import StringIO -from tempfile import mkdtemp, NamedTemporaryFile import re -from copy import deepcopy -from subprocess import call, PIPE +import random +from urllib2 import urlopen -from Texml.processor import process from lxml import etree -from lxml.etree import XMLSyntaxError, XSLTApplyError -from xmlutils import Xmill, tag, tagged, ifoption -from librarian.dcparser import Person -from librarian.parser import WLDocument -from librarian import ParseError, DCNS, get_resource, IOFile, Format +from xmlutils import Xmill, ifoption, tag_open_close +from librarian import DCNS, get_resource, IOFile from librarian import functions -from pdf import PDFFormat - +from pdf import PDFFormat, substitute_hyphens, fix_hanging def escape(really): @@ -40,7 +32,8 @@ def escape(really): prefix = (u'' % (really and 1 or 0)) postfix = u'' if isinstance(value, list): - import pdb; pdb.set_trace() + import pdb + pdb.set_trace() if isinstance(value, tuple): return prefix + value[0], value[1] + postfix else: @@ -49,15 +42,21 @@ def escape(really): return deco -def cmd(name, pass_text=False): - def wrap(self, element): - pre = u'' % name +def cmd(name, parms=None): + def wrap(self, element=None): + pre, post = tag_open_close('cmd', name=name) - if pass_text: - pre += "%s" % element.text - return pre + '' + if parms: + for parm in parms: + e = etree.Element("parm") + e.text = parm + pre += etree.tostring(e) + if element is not None: + pre += "" + post = "" + post + return pre, post else: - return pre, '' + return pre + post return wrap @@ -67,18 +66,19 @@ def mark_alien_characters(text): class EduModule(Xmill): - def __init__(self, options=None): - super(EduModule, self).__init__(options) + def __init__(self, options=None, state=None): + super(EduModule, self).__init__(options, state) self.activity_counter = 0 + self.activity_last = None self.exercise_counter = 0 def swap_endlines(txt): if self.options['strofa']: - txt = txt.replace("/\n", '') + txt = txt.replace("/\n", '') return txt + self.register_text_filter(swap_endlines) self.register_text_filter(functions.substitute_entities) self.register_text_filter(mark_alien_characters) - self.register_text_filter(swap_endlines) def get_dc(self, element, dc_field, single=False): values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri})) @@ -87,23 +87,26 @@ class EduModule(Xmill): return values def handle_rdf__RDF(self, _): - "skip metadata in generation" + """skip metadata in generation""" return @escape(True) def get_rightsinfo(self, element): rights_lic = self.get_dc(element, 'rights.license', True) - return u'' + \ - (rights_lic and u'%s' % rights_lic or '') +\ - u'%s' % self.get_dc(element, 'rights', True) +\ - u'' + return u'' + (rights_lic and u'%s' % rights_lic or '') + \ + u'%s' % self.get_dc(element, 'rights', True) + \ + u'' @escape(True) - def get_authors(self, element): - authors = self.get_dc(element, 'creator.expert') + \ - self.get_dc(element, 'creator.scenario') + \ - self.get_dc(element, 'creator.textbook') - return u', '.join(authors) + def get_authors(self, element, which=None): + dc = self.options['wldoc'].book_info + if which is None: + authors = dc.authors_textbook + \ + dc.authors_scenario + \ + dc.authors_expert + else: + authors = getattr(dc, "authors_%s" % which) + return u', '.join(author.readable() for author in authors if author) @escape(1) def get_title(self, element): @@ -112,33 +115,37 @@ class EduModule(Xmill): def handle_utwor(self, element): lines = [ u''' - - - \\documentclass[%s]{wl} - \\usepackage{style}''' % self.options['customization_str'], - self.options['has_cover'] and '\usepackage{makecover}', - (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or - (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or - (self.options['morefloats'] == 'none' and - u'''\\IfFileExists{morefloats.sty}{ - \\usepackage{morefloats} - }{}'''), - u'''\\def\\authors{%s}''' % self.get_authors(element), - u'''\\author{\\authors}''', - u'''\\title{%s}''' % self.get_title(element), - u'''\\def\\bookurl{%s}''' % self.get_dc(element, 'identifier.url', True), - u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element), - u''] + + + \\documentclass[%s]{wl} + \\usepackage{style}''' % self.options['customization_str'], + self.options['has_cover'] and '\usepackage{makecover}', + (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or + (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or + (self.options['morefloats'] == 'none' and + u'''\\IfFileExists{morefloats.sty}{ + \\usepackage{morefloats} + }{}'''), + u'''\\def\\authors{%s}''' % self.get_authors(element), + u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'), + u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'), + u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'), + + u'''\\author{\\authors}''', + u'''\\title{%s}''' % self.get_title(element), + u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(), + u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element), + u'' + ] return u"".join(filter(None, lines)), u'' - @escape(1) def handle_powiesc(self, element): return u""" - """, """""" + """, """""" @escape(1) def handle_texcommand(self, element): @@ -146,50 +153,77 @@ class EduModule(Xmill): return u'' % cmd, u'' handle_akap = \ - handle_akap = \ - handle_akap_cd = \ - handle_akap_cd = \ - handle_akap_dialog = \ - handle_akap_dialog = \ - handle_autor_utworu = \ - handle_dedykacja = \ - handle_didaskalia = \ - handle_didask_tekst = \ - handle_dlugi_cytat = \ - handle_dzielo_nadrzedne = \ - handle_lista_osoba = \ - handle_mat = \ - handle_miejsce_czas = \ - handle_motto = \ - handle_motto_podpis = \ - handle_naglowek_akt = \ - handle_naglowek_czesc = \ - handle_naglowek_listy = \ - handle_naglowek_osoba = \ - handle_naglowek_podrozdzial = \ - handle_naglowek_podrozdzial = \ - handle_naglowek_rozdzial = \ - handle_naglowek_rozdzial = \ - handle_naglowek_scena = \ - handle_nazwa_utworu = \ - handle_nota = \ - handle_osoba = \ - handle_pa = \ - handle_pe = \ - handle_podtytul = \ - handle_poezja_cyt = \ - handle_pr = \ - handle_pt = \ - handle_sekcja_asterysk = \ - handle_sekcja_swiatlo = \ - handle_separator_linia = \ - handle_slowo_obce = \ - handle_srodtytul = \ - handle_tytul_dziela = \ - handle_wyroznienie = \ - handle_texcommand - - _handle_strofa = cmd("strofa", True) + handle_akap_cd = \ + handle_akap_dialog = \ + handle_autor_utworu = \ + handle_dedykacja = \ + handle_didaskalia = \ + handle_didask_tekst = \ + handle_dlugi_cytat = \ + handle_dzielo_nadrzedne = \ + handle_lista_osoba = \ + handle_mat = \ + handle_miejsce_czas = \ + handle_motto = \ + handle_motto_podpis = \ + handle_naglowek_akt = \ + handle_naglowek_czesc = \ + handle_naglowek_listy = \ + handle_naglowek_osoba = \ + handle_naglowek_scena = \ + handle_nazwa_utworu = \ + handle_nota = \ + handle_osoba = \ + handle_pa = \ + handle_pe = \ + handle_podtytul = \ + handle_poezja_cyt = \ + handle_pr = \ + handle_pt = \ + handle_sekcja_asterysk = \ + handle_sekcja_swiatlo = \ + handle_separator_linia = \ + handle_slowo_obce = \ + handle_srodtytul = \ + handle_tytul_dziela = \ + handle_wyroznienie = \ + handle_dywiz = \ + handle_texcommand + + def handle_naglowek_rozdzial(self, element): + if not self.options['teacher']: + if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek', u'Dla ucznia')): + self.state['mute'] = False + else: + self.state['mute'] = True + return None + return self.handle_texcommand(element) + handle_naglowek_rozdzial.unmuter = True + + def handle_naglowek_podrozdzial(self, element): + self.activity_counter = 0 + if not self.options['teacher']: + if element.text.startswith(u'Dla ucznia'): + self.state['mute'] = False + return None + elif element.text.startswith(u'Dla nauczyciela'): + self.state['mute'] = True + return None + elif self.state['mute']: + return None + return self.handle_texcommand(element) + handle_naglowek_podrozdzial.unmuter = True + + def handle_uwaga(self, _e): + return None + + def handle_extra(self, _e): + return None + + def handle_nbsp(self, _e): + return '' + + _handle_strofa = cmd("strofa") def handle_strofa(self, element): self.options = {'strofa': True} @@ -202,18 +236,24 @@ class EduModule(Xmill): 'activity_counter': self.activity_counter, 'sub_gen': True, } - submill = EduModule(self.options) + submill = EduModule(self.options, self.state) - opis = submill.generate(element.xpath('opis')[0]) + if element.xpath('opis'): + opis = submill.generate(element.xpath('opis')[0]) + else: + opis = '' n = element.xpath('wskazowki') - if n: wskazowki = submill.generate(n[0]) - - else: wskazowki = '' + if n: + wskazowki = submill.generate(n[0]) + else: + wskazowki = '' n = element.xpath('pomoce') - if n: pomoce = submill.generate(n[0]) - else: pomoce = '' + if n: + pomoce = submill.generate(n[0]) + else: + pomoce = '' forma = ''.join(element.xpath('forma/text()')) @@ -221,9 +261,16 @@ class EduModule(Xmill): counter = self.activity_counter - return u""" + if element.getnext().tag == 'aktywnosc' or (self.activity_last and self.activity_last.getnext() == element): + counter_tex = """%(counter)d.""" % locals() + else: + counter_tex = '' + + self.activity_last = element -%(counter)d. + return u""" + +%(counter_tex)s %(czas)s %(forma)s @@ -249,20 +296,28 @@ class EduModule(Xmill): def handle_forma(self, *_): return - def handle_lista(self, element, attrs={}): + def handle_lista(self, element, attrs=None): ltype = element.attrib.get('typ', 'punkt') + if not element.findall("punkt"): + if ltype == 'czytelnia': + return 'W przygotowaniu.' + else: + return None if ltype == 'slowniczek': - surl = element.attrib.get('href', None) - sxml = None - if surl: - sxml = etree.fromstring(self.options['provider'].by_uri(surl).get_string()) - self.options = {'slowniczek': True, 'slowniczek_xml': sxml } - - listcmd = {'num': 'enumerate', - 'punkt': 'itemize', - 'alfa': 'itemize', - 'slowniczek': 'itemize', - 'czytelnia': 'itemize'}[ltype] + surl = element.attrib.get('src', None) + if surl is None: + # print '** missing src on , setting default' + surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/' + sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string()) + self.options = {'slowniczek': True, 'slowniczek_xml': sxml} + + listcmd = { + 'num': 'enumerate', + 'punkt': 'itemize', + 'alfa': 'itemize', + 'slowniczek': 'itemize', + 'czytelnia': 'itemize' + }[ltype] return u'' % listcmd, u'' @@ -271,17 +326,20 @@ class EduModule(Xmill): def handle_cwiczenie(self, element): exercise_handlers = { - 'wybor': Wybor} - # 'uporzadkuj': Uporzadkuj, - # 'luki': Luki, - # 'zastap': Zastap, - # 'przyporzadkuj': Przyporzadkuj, - # 'prawdafalsz': PrawdaFalsz + 'wybor': Wybor, + 'uporzadkuj': Uporzadkuj, + 'luki': Luki, + 'zastap': Zastap, + 'przyporzadkuj': Przyporzadkuj, + 'prawdafalsz': PrawdaFalsz + } typ = element.attrib['typ'] - if not typ in exercise_handlers: + self.exercise_counter += 1 + if typ not in exercise_handlers: return '(no handler)' - handler = exercise_handlers[typ](self.options) + self.options = {'exercise_counter': self.exercise_counter} + handler = exercise_handlers[typ](self.options, self.state) return handler.generate(element) # XXX this is copied from pyhtml.py, except for return and @@ -291,14 +349,17 @@ class EduModule(Xmill): definiens_s = '' # let's pull definiens from another document - if self.options['slowniczek_xml'] and (not nxt or nxt.tag != 'definiens'): + if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'): sxml = self.options['slowniczek_xml'] assert element.text != '' - defloc = sxml.xpath("//definiendum[text()='%s']" % element.text) + if "'" in (element.text or ''): + defloc = sxml.xpath("//definiendum[text()=\"%s\"]" % (element.text or '').strip()) + else: + defloc = sxml.xpath("//definiendum[text()='%s']" % (element.text or '').strip()) if defloc: definiens = defloc[0].getnext() if definiens.tag == 'definiens': - subgen = EduModule(self.options) + subgen = EduModule(self.options, self.state) definiens_s = subgen.generate(definiens) return u'', u": " + definiens_s @@ -317,14 +378,13 @@ class EduModule(Xmill): max_col = len(ks) self.options = {'columnts': max_col} # styling: - # has_frames = int(element.attrib.get("ramki", "0")) - # if has_frames: frames_c = "framed" - # else: frames_c = "" - # return u"""""" % frames_c, u"
" + # has_frames = int(element.attrib.get("ramki", "0")) + # if has_frames: frames_c = "framed" + # else: frames_c = "" + # return u"""""" % frames_c, u"
" return u''' -tabular%s - ''' % ('l' * max_col), \ - u'''tabular''' +tabular%s + ''' % ('l' * max_col), u'''tabular''' @escape(1) def handle_wiersz(self, element): @@ -333,59 +393,108 @@ class EduModule(Xmill): @escape(1) def handle_kol(self, element): if element.getnext() is not None: - return u"", u'' + return u"", u'' return u"", u"" - handle_link = cmd('em', True) + def handle_link(self, element): + if element.attrib.get('url'): + url = element.attrib.get('url') + if url == element.text: + return cmd('url')(self, element) + else: + return cmd('href', parms=[element.attrib['url']])(self, element) + else: + return cmd('emph')(self, element) + + def handle_obraz(self, element): + frmt = self.options['format'] + name = element.attrib.get('nazwa', '').strip() + image = frmt.get_image(name.strip()) + name = image.get_filename().rsplit('/', 1)[-1] + img_path = "obraz/%s" % name.replace("_", "") + frmt.attachments[img_path] = image + return cmd("obraz", parms=[img_path])(self) + + def handle_video(self, element): + url = element.attrib.get('url') + if not url: + print '!!