librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp, NamedTemporaryFile
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 from Texml.processor import process
  17 from lxml import etree
  18 from lxml.etree import XMLSyntaxError, XSLTApplyError
  19
  20 from librarian.dcparser import Person
  21 from librarian.parser import WLDocument
  22 from librarian import ParseError, DCNS, get_resource, OutputFile
  23 from librarian import functions
  24 from librarian.cover import ImageCover as WLCover
  25
  26
  27 functions.reg_substitute_entities()
  28 functions.reg_strip()
  29 functions.reg_starts_white()
  30 functions.reg_ends_white()
  31 functions.reg_texcommand()
  32
  33 STYLESHEETS = {
  34     'wl2tex': 'pdf/wl2tex.xslt',
  35 }
  36
  37 #CUSTOMIZATIONS = [
  38 #    'nofootnotes',
  39 #    'nothemes',
  40 #    'defaultleading',
  41 #    'onehalfleading',
  42 #    'doubleleading',
  43 #    'nowlfont',
  44 #    ]
  45
  46 def insert_tags(doc, split_re, tagname, exclude=None):
  47     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  48
  49     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  50     >>> insert_tags(t, re.compile('-'), 'd');
  51     >>> print etree.tostring(t)
  52     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  53     """
  54
  55     for elem in doc.iter(tag=etree.Element):
  56         if exclude and elem.tag in exclude:
  57             continue
  58         if elem.text:
  59             chunks = split_re.split(elem.text)
  60             while len(chunks) > 1:
  61                 ins = etree.Element(tagname)
  62                 ins.tail = chunks.pop()
  63                 elem.insert(0, ins)
  64             elem.text = chunks.pop(0)
  65         if elem.tail:
  66             chunks = split_re.split(elem.tail)
  67             parent = elem.getparent()
  68             ins_index = parent.index(elem) + 1
  69             while len(chunks) > 1:
  70                 ins = etree.Element(tagname)
  71                 ins.tail = chunks.pop()
  72                 parent.insert(ins_index, ins)
  73             elem.tail = chunks.pop(0)
  74
  75
  76 def substitute_hyphens(doc):
  77     insert_tags(doc,
  78                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  79                 "dywiz",
  80                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  81                 )
  82
  83
  84 def fix_hanging(doc):
  85     insert_tags(doc,
  86                 re.compile("(?<=\s\w)\s+"),
  87                 "nbsp",
  88                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  89                 )
  90
  91
  92 def move_motifs_inside(doc):
  93     """ moves motifs to be into block elements """
  94     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  95         for motif in master.xpath('motyw'):
  96             for sib in motif.itersiblings():
  97                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  98                     # motif shouldn't have a tail - it would be untagged text
  99                     motif.tail = None
 100                     motif.getparent().remove(motif)
 101                     sib.insert(0, motif)
 102                     break
 103
 104
 105 def hack_motifs(doc):
 106     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 107     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 108
 109     moves motifs in stanzas from first verse to second
 110     and from next to last to last, then inserts negative vspace before them
 111     """
 112     for motif in doc.findall('//strofa//motyw'):
 113         # find relevant verse-level tag
 114         verse, stanza = motif, motif.getparent()
 115         while stanza is not None and stanza.tag != 'strofa':
 116             verse, stanza = stanza, stanza.getparent()
 117         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 118         breaks_after = sum(1 for i in verse.itersiblings('br'))
 119         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 120             move_by = 1
 121             if breaks_after == 2:
 122                 move_by += 1
 123             moved_motif = deepcopy(motif)
 124             motif.tag = 'span'
 125             motif.text = None
 126             moved_motif.tail = None
 127             moved_motif.set('moved', str(move_by))
 128
 129             for br in verse.itersiblings('br'):
 130                 if move_by > 1:
 131                     move_by -= 1
 132                     continue
 133                 br.addnext(moved_motif)
 134                 break
 135
 136
 137 def parse_creator(doc):
 138     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 139     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 140                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 141                     namespaces = {'dc': str(DCNS)})[::-1]:
 142         if not person.text:
 143             continue
 144         p = Person.from_text(person.text)
 145         person_parsed = deepcopy(person)
 146         person_parsed.tag = person.tag + '_parsed'
 147         person_parsed.set('sortkey', person.text)
 148         person_parsed.text = p.readable()
 149         person.getparent().insert(0, person_parsed)
 150
 151
 152 def get_stylesheet(name):
 153     return get_resource(STYLESHEETS[name])
 154
 155
 156 def package_available(package, args='', verbose=False):
 157     """ check if a verion of a latex package accepting given args is available """
 158     tempdir = mkdtemp('-wl2pdf-test')
 159     fpath = os.path.join(tempdir, 'test.tex')
 160     f = open(fpath, 'w')
 161     f.write(r"""
 162         \documentclass{wl}
 163         \usepackage[%s]{%s}
 164         \begin{document}
 165         \end{document}
 166         """ % (args, package))
 167     f.close()
 168     if verbose:
 169         p = call(['xelatex', '-output-directory', tempdir, fpath])
 170     else:
 171         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 172     shutil.rmtree(tempdir)
 173     return p == 0
 174
 175
 176 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 177               cover=None, flags=None, customizations=None):
 178     """ produces a PDF file with XeLaTeX
 179
 180     wldoc: a WLDocument
 181     verbose: prints all output from LaTeX
 182     save_tex: path to save the intermediary LaTeX file to
 183     morefloats (old/new/none): force specific morefloats
 184     cover: a cover.Cover object
 185     flags: less-advertising,
 186     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 187     """
 188
 189     # Parse XSLT
 190     try:
 191         document = load_including_children(wldoc)
 192
 193         if cover:
 194             if cover is True:
 195                 cover = WLCover
 196             the_cover = cover(document.book_info)
 197             document.edoc.getroot().set('data-cover-width', str(the_cover.width))
 198             document.edoc.getroot().set('data-cover-height', str(the_cover.height))
 199             if the_cover.uses_dc_cover:
 200                 if document.book_info.cover_by:
 201                     document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 202                 if document.book_info.cover_source:
 203                     document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 204         if flags:
 205             for flag in flags:
 206                 document.edoc.getroot().set('flag-' + flag, 'yes')
 207
 208         # check for LaTeX packages
 209         if morefloats:
 210             document.edoc.getroot().set('morefloats', morefloats.lower())
 211         elif package_available('morefloats', 'maxfloats=19'):
 212             document.edoc.getroot().set('morefloats', 'new')
 213
 214         # add customizations
 215         if customizations is not None:
 216             document.edoc.getroot().set('customizations', u','.join(customizations))
 217
 218         # hack the tree
 219         #move_motifs_inside(document.edoc)
 220         #hack_motifs(document.edoc)
 221         parse_creator(document.edoc)
 222         if document.book_info.language == 'pol':
 223             substitute_hyphens(document.edoc)
 224         fix_hanging(document.edoc)
 225
 226         # wl -> TeXML
 227         style_filename = get_stylesheet("wl2tex")
 228         style = etree.parse(style_filename)
 229
 230         texml = document.transform(style)
 231
 232         # TeXML -> LaTeX
 233         temp = mkdtemp('-wl2pdf')
 234
 235         if cover:
 236             with open(os.path.join(temp, 'cover.jpg'), 'w') as f:
 237                 the_cover.save(f)
 238
 239         del document # no longer needed large object :)
 240
 241         tex_path = os.path.join(temp, 'doc.tex')
 242         fout = open(tex_path, 'w')
 243         process(StringIO(texml), fout, 'utf-8')
 244         fout.close()
 245         del texml
 246
 247         if save_tex:
 248             shutil.copy(tex_path, save_tex)
 249
 250         # LaTeX -> PDF
 251         shutil.copy(get_resource('pdf/wl.cls'), temp)
 252         shutil.copy(get_resource('res/wl-logo.png'), temp)
 253         shutil.copy('logo.eps', temp)
 254
 255         cwd = os.getcwd()
 256         os.chdir(temp)
 257
 258         if verbose:
 259             p = call(['xelatex', tex_path])
 260         else:
 261             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 262         if p:
 263             raise ParseError("Error parsing .tex file")
 264
 265         os.chdir(cwd)
 266
 267         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 268         pdf_path = os.path.join(temp, 'doc.pdf')
 269         shutil.move(pdf_path, output_file.name)
 270         shutil.rmtree(temp)
 271         return OutputFile.from_filename(output_file.name)
 272
 273     except (XMLSyntaxError, XSLTApplyError), e:
 274         raise ParseError(e)
 275
 276
 277 def load_including_children(wldoc=None, provider=None, uri=None):
 278     """ Makes one big xml file with children inserted at end.
 279
 280     Either wldoc or provider and URI must be provided.
 281     """
 282
 283     if uri and provider:
 284         f = provider.by_uri(uri)
 285         text = f.read().decode('utf-8')
 286         f.close()
 287     elif wldoc is not None:
 288         text = etree.tostring(wldoc.edoc, encoding=unicode)
 289         provider = wldoc.provider
 290     else:
 291         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 292
 293     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 294
 295     document = WLDocument.from_string(text, parse_dublincore=True)
 296     document.swap_endlines()
 297
 298     for child_uri in document.book_info.parts:
 299         child = load_including_children(provider=provider, uri=child_uri)
 300         document.edoc.getroot().append(child.edoc.getroot())
 301     return document