Fixes #3952.

[librarian.git] / librarian / pdf.py
diff --git a/librarian/pdf.py b/librarian/pdf.py

index 3c83cad..e6d897d 100644 (file)
--- a/librarian/pdf.py
+++ b/librarian/pdf.py
@@ -3,25 +3,34 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from __future__ import with_statement
+"""PDF creation library.
+
+Creates one big XML from the book and its children, converts it to LaTeX
+with TeXML, then runs it through XeLaTeX.
+
+"""
+from __future__ import print_function, unicode_literals
+
  import os
  import os.path
  import shutil
  import os
  import os.path
  import shutil
-from StringIO import StringIO
  from tempfile import mkdtemp, NamedTemporaryFile
  import re
  from copy import deepcopy
  from subprocess import call, PIPE
  from tempfile import mkdtemp, NamedTemporaryFile
  import re
  from copy import deepcopy
  from subprocess import call, PIPE
+from itertools import chain
  
  from Texml.processor import process
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
  from Texml.processor import process
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
+import six
  
  from librarian.dcparser import Person
  from librarian.parser import WLDocument
  
  from librarian.dcparser import Person
  from librarian.parser import WLDocument
-from librarian import ParseError, DCNS, get_resource, OutputFile
+from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  from librarian import functions
  from librarian import functions
-from librarian.cover import WLCover
+from librarian.cover import make_cover
+from .sponsor import sponsor_logo
  
  
  functions.reg_substitute_entities()
  
  
  functions.reg_substitute_entities()
@@ -34,21 +43,22 @@ STYLESHEETS = {
      'wl2tex': 'pdf/wl2tex.xslt',
  }
  
      'wl2tex': 'pdf/wl2tex.xslt',
  }
  
-#CUSTOMIZATIONS = [
-#    'nofootnotes',
-#    'nothemes',
-#    'defaultleading',
-#    'onehalfleading',
-#    'doubleleading',
-#    'nowlfont',
-#    ]
+# CUSTOMIZATIONS = [
+#     'nofootnotes',
+#     'nothemes',
+#     'defaultleading',
+#     'onehalfleading',
+#     'doubleleading',
+#     'nowlfont',
+# ]
+
  
  def insert_tags(doc, split_re, tagname, exclude=None):
      """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  
  
  def insert_tags(doc, split_re, tagname, exclude=None):
      """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  
-    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
-    >>> insert_tags(t, re.compile('-'), 'd');
-    >>> print etree.tostring(t)
+    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
+    >>> insert_tags(t, re.compile('-'), 'd')
+    >>> print(etree.tostring(t, encoding='unicode'))
      <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
      """
  
      <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
      """
  
@@ -77,7 +87,7 @@ def substitute_hyphens(doc):
      insert_tags(doc,
                  re.compile("(?<=[^-\s])-(?=[^-\s])"),
                  "dywiz",
      insert_tags(doc,
                  re.compile("(?<=[^-\s])-(?=[^-\s])"),
                  "dywiz",
-                exclude=[DCNS("identifier.url"), DCNS("rights.license")]
+                exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
                  )
  
  
                  )
  
  
@@ -89,12 +99,38 @@ def fix_hanging(doc):
                  )
  
  
                  )
  
  
+def fix_tables(doc):
+    for kol in doc.iter(tag='kol'):
+        if kol.tail is not None:
+            if not kol.tail.strip():
+                kol.tail = None
+    for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
+        if table.get('ramka') == '1' or table.get('ramki') == '1':
+            table.set('_format', '|' + 'X|' * len(table[0]))
+        else:
+            table.set('_format', 'X' * len(table[0]))
+
+
+def mark_subauthors(doc):
+    root_author = ', '.join(elem.text for elem in doc.findall('./' + RDFNS('RDF') + '//' + DCNS('creator_parsed')))
+    last_author = None
+    # If the author differs from both the author of the whole work and the
+    # previous sub-work's author, should we insert some marker into the RDF?
+    for subutwor in doc.xpath('/utwor/utwor'):
+        author = ', '.join(elem.text for elem in subutwor.findall('.//' + DCNS('creator_parsed')))
+        if author not in (last_author, root_author):
+            subutwor.find('.//' + RDFNS('RDF')).append(etree.Element('use_subauthor'))
+        last_author = author
+
+
  def move_motifs_inside(doc):
      """ moves motifs to be into block elements """
  def move_motifs_inside(doc):
      """ moves motifs to be into block elements """
-    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
+    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
+                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
          for motif in master.xpath('motyw'):
              for sib in motif.itersiblings():
          for motif in master.xpath('motyw'):
              for sib in motif.itersiblings():
-                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
+                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
+                                   'begin', 'end', 'motyw', 'extra', 'uwaga'):
                      # motif shouldn't have a tail - it would be untagged text
                      motif.tail = None
                      motif.getparent().remove(motif)
                      # motif shouldn't have a tail - it would be untagged text
                      motif.tail = None
                      motif.getparent().remove(motif)
@@ -135,10 +171,13 @@ def hack_motifs(doc):
  
  
  def parse_creator(doc):
  
  
  def parse_creator(doc):
-    """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
-    for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
-                    'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
-                    namespaces = {'dc': str(DCNS)})[::-1]:
+    """Generates readable versions of creator and translator tags.
+
+    Finds all dc:creator and dc:contributor.translator tags
+    and adds *_parsed versions with forenames first.
+    """
+    for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
+                            namespaces={'dc': str(DCNS)})[::-1]:
          if not person.text:
              continue
          p = Person.from_text(person.text)
          if not person.text:
              continue
          p = Person.from_text(person.text)
@@ -158,11 +197,11 @@ def package_available(package, args='', verbose=False):
      tempdir = mkdtemp('-wl2pdf-test')
      fpath = os.path.join(tempdir, 'test.tex')
      f = open(fpath, 'w')
      tempdir = mkdtemp('-wl2pdf-test')
      fpath = os.path.join(tempdir, 'test.tex')
      f = open(fpath, 'w')
-    f.write(r"""
-        \documentclass{wl}
-        \usepackage[%s]{%s}
-        \begin{document}
-        \end{document}
+    f.write("""
+        \\documentclass{wl}
+        \\usepackage[%s]{%s}
+        \\begin{document}
+        \\end{document}
          """ % (args, package))
      f.close()
      if verbose:
          """ % (args, package))
      f.close()
      if verbose:
@@ -174,7 +213,7 @@ def package_available(package, args='', verbose=False):
  
  
  def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
  
  
  def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
-              cover=None, flags=None, customizations=None):
+              cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
      """ produces a PDF file with XeLaTeX
  
      wldoc: a WLDocument
      """ produces a PDF file with XeLaTeX
  
      wldoc: a WLDocument
@@ -188,32 +227,44 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
  
      # Parse XSLT
      try:
  
      # Parse XSLT
      try:
+        book_info = wldoc.book_info
          document = load_including_children(wldoc)
          document = load_including_children(wldoc)
+        root = document.edoc.getroot()
  
          if cover:
              if cover is True:
  
          if cover:
              if cover is True:
-                cover = WLCover
-            bound_cover = cover(document.book_info)
-            document.edoc.getroot().set('data-cover-width', str(bound_cover.width))
-            document.edoc.getroot().set('data-cover-height', str(bound_cover.height))
+                cover = make_cover
+            bound_cover = cover(book_info, width=1200)
+            root.set('data-cover-width', str(bound_cover.width))
+            root.set('data-cover-height', str(bound_cover.height))
              if bound_cover.uses_dc_cover:
              if bound_cover.uses_dc_cover:
-                if document.book_info.cover_by:
-                    document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
-                if document.book_info.cover_source:
-                    document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
+                if book_info.cover_by:
+                    root.set('data-cover-by', book_info.cover_by)
+                if book_info.cover_source:
+                    root.set('data-cover-source', book_info.cover_source)
          if flags:
              for flag in flags:
          if flags:
              for flag in flags:
-                document.edoc.getroot().set('flag-' + flag, 'yes')
+                root.set('flag-' + flag, 'yes')
  
          # check for LaTeX packages
          if morefloats:
  
          # check for LaTeX packages
          if morefloats:
-            document.edoc.getroot().set('morefloats', morefloats.lower())
+            root.set('morefloats', morefloats.lower())
          elif package_available('morefloats', 'maxfloats=19'):
          elif package_available('morefloats', 'maxfloats=19'):
-            document.edoc.getroot().set('morefloats', 'new')
+            root.set('morefloats', 'new')
  
          # add customizations
          if customizations is not None:
  
          # add customizations
          if customizations is not None:
-            document.edoc.getroot().set('customizations', u','.join(customizations))
+            root.set('customizations', u','.join(customizations))
+
+        # add editors info
+        editors = document.editors()
+        if editors:
+            root.set('editors', u', '.join(sorted(
+                editor.readable() for editor in editors)))
+        if document.book_info.funders:
+            root.set('funders', u', '.join(document.book_info.funders))
+        if document.book_info.thanks:
+            root.set('thanks', document.book_info.thanks)
  
          # hack the tree
          move_motifs_inside(document.edoc)
  
          # hack the tree
          move_motifs_inside(document.edoc)
@@ -221,25 +272,43 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
          parse_creator(document.edoc)
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
          parse_creator(document.edoc)
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
+        fix_tables(document.edoc)
+        mark_subauthors(document.edoc)
  
          # wl -> TeXML
          style_filename = get_stylesheet("wl2tex")
          style = etree.parse(style_filename)
  
          # wl -> TeXML
          style_filename = get_stylesheet("wl2tex")
          style = etree.parse(style_filename)
-
-        texml = document.transform(style)
+        functions.reg_mathml_latex()
  
          # TeXML -> LaTeX
          temp = mkdtemp('-wl2pdf')
  
  
          # TeXML -> LaTeX
          temp = mkdtemp('-wl2pdf')
  
+        for ilustr in document.edoc.findall("//ilustr"):
+            shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
+
+        for sponsor in book_info.sponsors:
+            ins = etree.Element("data-sponsor", name=sponsor)
+            logo = sponsor_logo(sponsor)
+            if logo:
+                fname = 'sponsor-%s' % os.path.basename(logo)
+                shutil.copy(logo, os.path.join(temp, fname))
+                ins.set('src', fname)
+            root.insert(0, ins)
+                
+        if book_info.sponsor_note:
+            root.set("sponsor-note", book_info.sponsor_note)
+
+        texml = document.transform(style)
+
          if cover:
              with open(os.path.join(temp, 'cover.png'), 'w') as f:
          if cover:
              with open(os.path.join(temp, 'cover.png'), 'w') as f:
-                bound_cover.save(f)
+                bound_cover.save(f, quality=80)
  
  
-        del document # no longer needed large object :)
+        del document  # no longer needed large object :)
  
          tex_path = os.path.join(temp, 'doc.tex')
  
          tex_path = os.path.join(temp, 'doc.tex')
-        fout = open(tex_path, 'w')
-        process(StringIO(texml), fout, 'utf-8')
+        fout = open(tex_path, 'wb')
+        process(six.BytesIO(texml), fout, 'utf-8')
          fout.close()
          del texml
  
          fout.close()
          del texml
  
@@ -250,18 +319,24 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
          shutil.copy(get_resource('pdf/wl.cls'), temp)
          shutil.copy(get_resource('res/wl-logo.png'), temp)
  
          shutil.copy(get_resource('pdf/wl.cls'), temp)
          shutil.copy(get_resource('res/wl-logo.png'), temp)
  
+        if latex_dir:
+            return temp
+
          try:
              cwd = os.getcwd()
          except OSError:
              cwd = None
          os.chdir(temp)
  
          try:
              cwd = os.getcwd()
          except OSError:
              cwd = None
          os.chdir(temp)
  
-        if verbose:
-            p = call(['xelatex', tex_path])
-        else:
-            p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
-        if p:
-            raise ParseError("Error parsing .tex file")
+        # some things work better when compiled twice
+        # (table of contents, [line numbers - disabled])
+        for run in range(2):
+            if verbose:
+                p = call(['xelatex', tex_path])
+            else:
+                p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
+            if p:
+                raise ParseError("Error parsing .tex file")
  
          if cwd is not None:
              os.chdir(cwd)
  
          if cwd is not None:
              os.chdir(cwd)
@@ -272,7 +347,7 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
          shutil.rmtree(temp)
          return OutputFile.from_filename(output_file.name)
  
          shutil.rmtree(temp)
          return OutputFile.from_filename(output_file.name)
  
-    except (XMLSyntaxError, XSLTApplyError), e:
+    except (XMLSyntaxError, XSLTApplyError) as e:
          raise ParseError(e)
  
  
          raise ParseError(e)
  
  
@@ -287,14 +362,14 @@ def load_including_children(wldoc=None, provider=None, uri=None):
          text = f.read().decode('utf-8')
          f.close()
      elif wldoc is not None:
          text = f.read().decode('utf-8')
          f.close()
      elif wldoc is not None:
-        text = etree.tostring(wldoc.edoc, encoding=unicode)
+        text = etree.tostring(wldoc.edoc, encoding='unicode')
          provider = wldoc.provider
      else:
          raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
  
          provider = wldoc.provider
      else:
          raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
  
-    text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
+    text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
  
  
-    document = WLDocument.from_string(text, parse_dublincore=True)
+    document = WLDocument.from_bytes(text.encode('utf-8'), parse_dublincore=True, provider=provider)
      document.swap_endlines()
  
      for child_uri in document.book_info.parts:
      document.swap_endlines()
  
      for child_uri in document.book_info.parts: