librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS, get_resource
  25 from librarian import functions
  26
  27
# Module-level side effect: each reg_*() helper from librarian.functions is
# called once, at import time.
# NOTE(review): presumably these register the extension functions used by the
# wl2tex stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to the resource path that get_stylesheet() hands to
# get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  37
  38
  39 def insert_tags(doc, split_re, tagname, exclude=None):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter(tag=etree.Element):
  49         if exclude and elem.tag in exclude:
  50             continue
  51         if elem.text:
  52             chunks = split_re.split(elem.text)
  53             while len(chunks) > 1:
  54                 ins = etree.Element(tagname)
  55                 ins.tail = chunks.pop()
  56                 elem.insert(0, ins)
  57             elem.text = chunks.pop(0)
  58         if elem.tail:
  59             chunks = split_re.split(elem.tail)
  60             parent = elem.getparent()
  61             ins_index = parent.index(elem) + 1
  62             while len(chunks) > 1:
  63                 ins = etree.Element(tagname)
  64                 ins.tail = chunks.pop()
  65                 parent.insert(ins_index, ins)
  66             elem.tail = chunks.pop(0)
  67
  68
  69 def substitute_hyphens(doc):
  70     insert_tags(doc,
  71                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  72                 "dywiz",
  73                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  74                 )
  75
  76
  77 def fix_hanging(doc):
  78     insert_tags(doc,
  79                 re.compile("(?<=\s\w)\s+"),
  80                 "nbsp",
  81                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  82                 )
  83
  84
  85 def move_motifs_inside(doc):
  86     """ moves motifs to be into block elements """
  87     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  88         for motif in master.xpath('motyw'):
  89             for sib in motif.itersiblings():
  90                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  91                     # motif shouldn't have a tail - it would be untagged text
  92                     motif.tail = None
  93                     motif.getparent().remove(motif)
  94                     sib.insert(0, motif)
  95                     break
  96
  97
  98 def hack_motifs(doc):
  99     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 100     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 101
 102     moves motifs in stanzas from first verse to second
 103     and from next to last to last, then inserts negative vspace before them
 104     """
 105     for motif in doc.findall('//strofa//motyw'):
 106         # find relevant verse-level tag
 107         verse, stanza = motif, motif.getparent()
 108         while stanza is not None and stanza.tag != 'strofa':
 109             verse, stanza = stanza, stanza.getparent()
 110         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 111         breaks_after = sum(1 for i in verse.itersiblings('br'))
 112         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 113             move_by = 1
 114             if breaks_after == 2:
 115                 move_by += 1
 116             moved_motif = deepcopy(motif)
 117             motif.tag = 'span'
 118             motif.text = None
 119             moved_motif.tail = None
 120             moved_motif.set('moved', str(move_by))
 121
 122             for br in verse.itersiblings('br'):
 123                 if move_by > 1:
 124                     move_by -= 1
 125                     continue
 126                 br.addnext(moved_motif)
 127                 break
 128
 129
 130 def parse_creator(doc):
 131     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 132     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 133                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 134                     namespaces = {'dc': str(DCNS)})[::-1]:
 135         if not person.text:
 136             continue
 137         p = Person.from_text(person.text)
 138         person_parsed = deepcopy(person)
 139         person_parsed.tag = person.tag + '_parsed'
 140         person_parsed.set('sortkey', person.text)
 141         person_parsed.text = p.readable()
 142         person.getparent().insert(0, person_parsed)
 143
 144
 145 def get_stylesheet(name):
 146     return get_resource(STYLESHEETS[name])
 147
 148
 149 def package_available(package, args='', verbose=False):
 150     """ check if a verion of a latex package accepting given args is available """
 151     tempdir = mkdtemp('-wl2pdf-test')
 152     fpath = os.path.join(tempdir, 'test.tex')
 153     f = open(fpath, 'w')
 154     f.write(r"""
 155         \documentclass{book}
 156         \usepackage[%s]{%s}
 157         \begin{document}
 158         \end{document}
 159         """ % (args, package))
 160     f.close()
 161     if verbose:
 162         p = call(['xelatex', '-output-directory', tempdir, fpath])
 163     else:
 164         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 165     shutil.rmtree(tempdir)
 166     return p == 0
 167
 168
 169 def transform(provider, slug=None, file_path=None,
 170               output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
 171               cover=None, flags=None):
 172     """ produces a PDF file with XeLaTeX
 173
 174     provider: a DocProvider
 175     slug: slug of file to process, available by provider
 176     file_path can be provided instead of a slug
 177     output_file: file-like object or path to output file
 178     output_dir: path to directory to save output file to; either this or output_file must be present
 179     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 180     verbose: prints all output from LaTeX
 181     save_tex: path to save the intermediary LaTeX file to
 182     morefloats (old/new/none): force specific morefloats
 183     cover: a cover.Cover object
 184     flags: less-advertising,
 185     """
 186
 187     # Parse XSLT
 188     try:
 189         if file_path:
 190             if slug:
 191                 raise ValueError('slug or file_path should be specified, not both')
 192             document = load_including_children(provider, file_path=file_path)
 193         else:
 194             if not slug:
 195                 raise ValueError('either slug or file_path should be specified')
 196             document = load_including_children(provider, slug=slug)
 197
 198         if cover:
 199             document.edoc.getroot().set('data-cover-width', str(cover.width))
 200             document.edoc.getroot().set('data-cover-height', str(cover.height))
 201         if flags:
 202             for flag in flags:
 203                 document.edoc.getroot().set('flag-' + flag, 'yes')
 204
 205         # check for LaTeX packages
 206         if morefloats:
 207             document.edoc.getroot().set('morefloats', morefloats.lower())
 208         elif package_available('morefloats', 'maxfloats=19'):
 209             document.edoc.getroot().set('morefloats', 'new')
 210
 211         # hack the tree
 212         move_motifs_inside(document.edoc)
 213         hack_motifs(document.edoc)
 214         parse_creator(document.edoc)
 215         substitute_hyphens(document.edoc)
 216         fix_hanging(document.edoc)
 217
 218         # find output dir
 219         if make_dir and output_dir is not None:
 220             author = unicode(document.book_info.author)
 221             output_dir = os.path.join(output_dir, author)
 222
 223         # wl -> TeXML
 224         style_filename = get_stylesheet("wl2tex")
 225         style = etree.parse(style_filename)
 226         texml = document.transform(style)
 227
 228         # TeXML -> LaTeX
 229         temp = mkdtemp('-wl2pdf')
 230
 231         if cover:
 232             c = cover(document.book_info.author.readable(), document.book_info.title)
 233             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 234                 c.save(f)
 235
 236         del document # no longer needed large object :)
 237
 238         tex_path = os.path.join(temp, 'doc.tex')
 239         fout = open(tex_path, 'w')
 240         process(StringIO(texml), fout, 'utf-8')
 241         fout.close()
 242         del texml
 243
 244         if save_tex:
 245             shutil.copy(tex_path, save_tex)
 246
 247         # LaTeX -> PDF
 248         shutil.copy(get_resource('pdf/wl.sty'), temp)
 249         shutil.copy(get_resource('res/wl-logo.png'), temp)
 250
 251         cwd = os.getcwd()
 252         os.chdir(temp)
 253
 254         if verbose:
 255             p = call(['xelatex', tex_path])
 256         else:
 257             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 258         if p:
 259             raise ParseError("Error parsing .tex file")
 260
 261         os.chdir(cwd)
 262
 263         # save the PDF
 264         pdf_path = os.path.join(temp, 'doc.pdf')
 265         if output_dir is not None:
 266             try:
 267                 os.makedirs(output_dir)
 268             except OSError:
 269                 pass
 270             if slug:
 271                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 272             else:
 273                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 274             shutil.move(pdf_path, output_path)
 275         else:
 276             if hasattr(output_file, 'write'):
 277                 # file-like object
 278                 with open(pdf_path) as f:
 279                     output_file.write(f.read())
 280                 output_file.close()
 281             else:
 282                 # path to output file
 283                 shutil.copy(pdf_path, output_file)
 284         shutil.rmtree(temp)
 285
 286     except (XMLSyntaxError, XSLTApplyError), e:
 287         raise ParseError(e)
 288
 289
 290 def load_including_children(provider, slug=None, uri=None, file_path=None):
 291     """ makes one big xml file with children inserted at end
 292     either slug or uri must be provided
 293     """
 294
 295     if uri:
 296         f = provider.by_uri(uri)
 297     elif slug:
 298         f = provider[slug]
 299     elif file_path:
 300         f = open(file_path, 'r')
 301     else:
 302         raise ValueError('Neither slug, URI nor file path provided for a book.')
 303
 304     text = f.read().decode('utf-8')
 305     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 306
 307     document = WLDocument.from_string(text, True,
 308         parse_dublincore=True)
 309
 310     f.close()
 311     for child_uri in document.book_info.parts:
 312         print child_uri
 313         child = load_including_children(provider, uri=child_uri)
 314         document.edoc.getroot().append(child.edoc.getroot())
 315     return document