epub fix

[librarian.git] / librarian / pdf.py
diff --git a/librarian/pdf.py b/librarian/pdf.py

index 169d661..b9ead15 100644 (file)
--- a/librarian/pdf.py
+++ b/librarian/pdf.py
@@ -3,12 +3,17 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
+from __future__ import with_statement
  import os
  import os.path
  import shutil
  from StringIO import StringIO
  from tempfile import mkdtemp
  import re
  import os
  import os.path
  import shutil
  from StringIO import StringIO
  from tempfile import mkdtemp
  import re
+from copy import deepcopy
+
+import sys
+sys.path.append('..') # for running from working copy
  
  from Texml.processor import process
  from lxml import etree
  
  from Texml.processor import process
  from lxml import etree
@@ -18,7 +23,13 @@ from librarian.parser import WLDocument
  from librarian import ParseError
  from librarian import functions
  
  from librarian import ParseError
  from librarian import functions
  
+
+
  functions.reg_substitute_entities()
  functions.reg_substitute_entities()
+functions.reg_person_name()
+functions.reg_strip()
+functions.reg_starts_white()
+functions.reg_ends_white()
  
  STYLESHEETS = {
      'wl2tex': 'xslt/wl2tex.xslt',
  
  STYLESHEETS = {
      'wl2tex': 'xslt/wl2tex.xslt',
@@ -26,7 +37,14 @@ STYLESHEETS = {
  
  
  def insert_tags(doc, split_re, tagname):
  
  
  def insert_tags(doc, split_re, tagname):
-    print tagname
+    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree
+
+    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
+    >>> insert_tags(t, re.compile('-'), 'd');
+    >>> print etree.tostring(t)
+    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
+    """
+
      for elem in doc.iter():
          if elem.text:
              chunks = split_re.split(elem.text)
      for elem in doc.iter():
          if elem.text:
              chunks = split_re.split(elem.text)
@@ -42,7 +60,7 @@ def insert_tags(doc, split_re, tagname):
              elem.tail = chunks.pop(0)
              while chunks:
                  ins = etree.Element(tagname)
              elem.tail = chunks.pop(0)
              while chunks:
                  ins = etree.Element(tagname)
-                ins.tail = chunks.pop(0)
+                ins.tail = chunks.pop()
                  parent.insert(ins_index, ins)
  
  
                  parent.insert(ins_index, ins)
  
  
@@ -58,8 +76,11 @@ def fix_hanging(doc):
                  "nbsp")
  
  
                  "nbsp")
  
  
+def get_resource(path):
+    return os.path.join(os.path.dirname(__file__), path)
+
  def get_stylesheet(name):
  def get_stylesheet(name):
-    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
+    return get_resource(STYLESHEETS[name])
  
  def transform(provider, slug, output_file=None, output_dir=None):
      """ produces a pdf file
  
  def transform(provider, slug, output_file=None, output_dir=None):
      """ produces a pdf file
@@ -76,10 +97,34 @@ def transform(provider, slug, output_file=None, output_dir=None):
  
          document = load_including_children(provider, slug)
  
  
          document = load_including_children(provider, slug)
  
+        # dirty hack for the marginpar-creates-orphans LaTeX problem
+        # see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
+        for motif in document.edoc.findall('//strofa//motyw'):
+            # find relevant verse-level tag
+            verse, stanza = motif, motif.getparent()
+            while stanza is not None and stanza.tag != 'strofa':
+                verse, stanza = stanza, stanza.getparent()
+            breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
+            breaks_after = sum(1 for i in verse.itersiblings('br'))
+            if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
+                move_by = 1
+                if breaks_after == 2:
+                    move_by += 1
+                moved_motif = deepcopy(motif)
+                motif.tag = 'span'
+                motif.text = None
+                moved_motif.tail = None
+                moved_motif.set('moved', str(move_by))
+
+                for br in verse.itersiblings(tag='br'):
+                    if move_by > 1:
+                        move_by -= 1
+                        continue
+                    br.addnext(moved_motif)
+                    break
+
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
-        
-        print etree.tostring(document.edoc)
  
          # if output to dir, create the file
          if output_dir is not None:
  
          # if output to dir, create the file
          if output_dir is not None:
@@ -96,6 +141,8 @@ def transform(provider, slug, output_file=None, output_dir=None):
          fout.close()
          del texml
  
          fout.close()
          del texml
  
+        shutil.copy(get_resource('pdf/wl.sty'), temp)
+        shutil.copy(get_resource('pdf/wl-logo.png'), temp)
          print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
          if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
              raise ParseError("Error parsing .tex file")
          print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
          if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
              raise ParseError("Error parsing .tex file")