epub fix

[librarian.git] / librarian / pdf.py
diff --git a/librarian/pdf.py b/librarian/pdf.py

index 169d661..b9ead15 100644 (file)
--- a/librarian/pdf.py
+++ b/librarian/pdf.py
@@ -3,12 +3,17 @@
  # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
+from __future__ import with_statement
  import os
  import os.path
  import shutil
  from StringIO import StringIO
  from tempfile import mkdtemp
  import re
+from copy import deepcopy
+
+import sys
+sys.path.append('..') # for running from working copy
  
  from Texml.processor import process
  from lxml import etree
@@ -18,7 +23,13 @@ from librarian.parser import WLDocument
  from librarian import ParseError
  from librarian import functions
  
+
+
  functions.reg_substitute_entities()
+functions.reg_person_name()
+functions.reg_strip()
+functions.reg_starts_white()
+functions.reg_ends_white()
  
  STYLESHEETS = {
      'wl2tex': 'xslt/wl2tex.xslt',
@@ -26,7 +37,14 @@ STYLESHEETS = {
  
  
  def insert_tags(doc, split_re, tagname):
-    print tagname
+    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree
+
+    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
+    >>> insert_tags(t, re.compile('-'), 'd');
+    >>> print etree.tostring(t)
+    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
+    """
+
      for elem in doc.iter():
          if elem.text:
              chunks = split_re.split(elem.text)
@@ -42,7 +60,7 @@ def insert_tags(doc, split_re, tagname):
              elem.tail = chunks.pop(0)
              while chunks:
                  ins = etree.Element(tagname)
-                ins.tail = chunks.pop(0)
+                ins.tail = chunks.pop()
                  parent.insert(ins_index, ins)
  
  
@@ -58,8 +76,11 @@ def fix_hanging(doc):
                  "nbsp")
  
  
+def get_resource(path):
+    return os.path.join(os.path.dirname(__file__), path)
+
  def get_stylesheet(name):
-    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
+    return get_resource(STYLESHEETS[name])
  
  def transform(provider, slug, output_file=None, output_dir=None):
      """ produces a pdf file
@@ -76,10 +97,34 @@ def transform(provider, slug, output_file=None, output_dir=None):
  
          document = load_including_children(provider, slug)
  
+        # dirty hack for the marginpar-creates-orphans LaTeX problem
+        # see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
+        for motif in document.edoc.findall('//strofa//motyw'):
+            # find relevant verse-level tag
+            verse, stanza = motif, motif.getparent()
+            while stanza is not None and stanza.tag != 'strofa':
+                verse, stanza = stanza, stanza.getparent()
+            breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
+            breaks_after = sum(1 for i in verse.itersiblings('br'))
+            if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
+                move_by = 1
+                if breaks_after == 2:
+                    move_by += 1
+                moved_motif = deepcopy(motif)
+                motif.tag = 'span'
+                motif.text = None
+                moved_motif.tail = None
+                moved_motif.set('moved', str(move_by))
+
+                for br in verse.itersiblings(tag='br'):
+                    if move_by > 1:
+                        move_by -= 1
+                        continue
+                    br.addnext(moved_motif)
+                    break
+
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
-        
-        print etree.tostring(document.edoc)
  
          # if output to dir, create the file
          if output_dir is not None:
@@ -96,6 +141,8 @@ def transform(provider, slug, output_file=None, output_dir=None):
          fout.close()
          del texml
  
+        shutil.copy(get_resource('pdf/wl.sty'), temp)
+        shutil.copy(get_resource('pdf/wl-logo.png'), temp)
          print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
          if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
              raise ParseError("Error parsing .tex file")