Merge remote-tracking branch 'upstream/master'
diff --git a/librarian/epub.py b/librarian/epub.py
index 10922d4..b17ca0e 100644
@@ -17,11 +17,12 @@ from tempfile import mkdtemp, NamedTemporaryFile
 from shutil import rmtree
 
 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
-from librarian.cover import WLCover
+from librarian.cover import DefaultEbookCover
 
 from librarian import functions, get_resource
 
 functions.reg_person_name()
+functions.reg_lang_code_3to2()
 
 
 def inner_xml(node):
@@ -77,6 +78,7 @@ def replace_characters(node):
     def replace_chars(text):
         if text is None:
             return None
+        # text = re.sub(r"(?<=\s\w)\s+", u"\u00a0", text)  # fix for hanging single-letter conjunctions – for future use.
         return text.replace(u"\ufeff", u"")\
                    .replace("---", u"\u2014")\
                    .replace("--", u"\u2013")\
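
Note: a minimal sketch of what the commented-out substitution above would do, assuming Python 2 as in the rest of the module; the helper name and the sample string are hypothetical:

    import re

    def bind_single_letter_words(text):
        # After a lone word character preceded by whitespace (e.g. Polish "i", "w", "z"),
        # replace the following whitespace with a non-breaking space, so the single-letter
        # word is not left hanging at the end of a line.
        return re.sub(r"(?<=\s\w)\s+", u"\u00a0", text)

    print repr(bind_single_letter_words(u"Ala i kot w lesie"))
    # -> u'Ala i\xa0kot w\xa0lesie' (a single letter at the very start of the text is not
    # caught, because the lookbehind requires preceding whitespace)
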
@@ -117,10 +119,10 @@ class Stanza(object):
     Slashes may only occur directly in the stanza. Any slashes in subelements
     will be ignored, and the subelements will be put inside verse elements.
 
-    >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
+    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
     >>> Stanza(s).versify()
     >>> print etree.tostring(s)
-    <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
+    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
     
     """
@@ -149,16 +151,16 @@ class Stanza(object):
         return self.open_verse
 
     def push_text(self, text):
-        if not text or not text.strip():
+        if not text:
             return
         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
             if i:
                 self.open_normal_verse()
             verse = self.get_open_verse()
             if len(verse):
-                verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
+                verse[-1].tail = (verse[-1].tail or "") + verse_text
             else:
-                verse.text = (verse.text or "") + verse_text.strip()
+                verse.text = (verse.text or "") + verse_text
 
     def push_elem(self, elem):
         if elem.tag.startswith("wers"):
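
Note: push_text above splits stanza text into verses with re.split(r"/\s*\n", text): a slash followed by optional whitespace and a newline starts a new verse. A quick illustration (the sample string is hypothetical):

    import re

    text = "a/\nb c/ \nd"
    # A "/" at the end of a line starts a new verse; a "/" not followed by a newline is left alone.
    print re.split(r"/\s*\n", text)
    # -> ['a', 'b c', 'd']
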
@@ -247,7 +249,7 @@ class TOC(object):
 
             nav_label = nav_map.makeelement(NCXNS('navLabel'))
             text = nav_map.makeelement(NCXNS('text'))
-            text.text = child.name
+            text.text = re.sub(r'\n', ' ', child.name)
             nav_label.append(text)
             nav_point.append(nav_label)
 
@@ -290,18 +292,41 @@ def chop(main_text):
     main_xml_part = part_xml[0] # master
 
     last_node_part = False
+    
+    # the loops below are a workaround for a problem with epubs in drama ebooks without acts
+    is_scene = False
+    is_act = False
     for one_part in main_text:
         name = one_part.tag
-        if name == 'naglowek_czesc':
-            yield part_xml
-            last_node_part = True
-            main_xml_part[:] = [deepcopy(one_part)]
-        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
-            yield part_xml
-            main_xml_part[:] = [deepcopy(one_part)]
+        if name == 'naglowek_scena':
+            is_scene = True
+        elif name == 'naglowek_akt':
+            is_act = True
+    
+    for one_part in main_text:
+        name = one_part.tag
+        if is_act is False and is_scene is True:
+            if name == 'naglowek_czesc':
+                yield part_xml
+                last_node_part = True
+                main_xml_part[:] = [deepcopy(one_part)]
+            elif not last_node_part and name == 'naglowek_scena':
+                yield part_xml
+                main_xml_part[:] = [deepcopy(one_part)]
+            else:
+                main_xml_part.append(deepcopy(one_part))
+                last_node_part = False
         else:
-            main_xml_part.append(deepcopy(one_part))
-            last_node_part = False
+            if name == 'naglowek_czesc':
+                yield part_xml
+                last_node_part = True
+                main_xml_part[:] = [deepcopy(one_part)]
+            elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
+                yield part_xml
+                main_xml_part[:] = [deepcopy(one_part)]
+            else:
+                main_xml_part.append(deepcopy(one_part))
+                last_node_part = False
     yield part_xml
 
 
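
Note: a minimal sketch of the fallback the two new loops in chop() implement: first detect a drama that has scene headers but no act headers, then choose the heading tags that start a new chunk accordingly (element names as in the hunk above; the sample document is hypothetical):

    from lxml import etree

    main_text = etree.fromstring(
        "<utwor><naglowek_scena>Scena I</naglowek_scena><akap>...</akap>"
        "<naglowek_scena>Scena II</naglowek_scena><akap>...</akap></utwor>")

    # First pass over the children: are there scene headers, and are there act headers?
    is_scene = any(el.tag == 'naglowek_scena' for el in main_text)
    is_act = any(el.tag == 'naglowek_akt' for el in main_text)

    # Second pass would split the text into chunks; without acts, scenes become the
    # chapter-level split points instead.
    if is_scene and not is_act:
        split_tags = ('naglowek_czesc', 'naglowek_scena')
    else:
        split_tags = ('naglowek_czesc', 'naglowek_rozdzial', 'naglowek_akt', 'srodtytul')
    print split_tags
    # -> ('naglowek_czesc', 'naglowek_scena')
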
@@ -310,7 +335,9 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s
 
     toc = TOC()
     for element in chunk_xml[0]:
-        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
+        if element.tag == "naglowek_czesc":
+            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
+        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
             toc.add(node_name(element), "part%d.html" % chunk_no)
         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
             subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
@@ -336,7 +363,7 @@ def transform(wldoc, verbose=False,
 
     sample=n: generate sample e-book (with at least n paragraphs)
     cover: a cover.Cover factory or True for default
-    flags: less-advertising, without-fonts, working-copy
+    flags: less-advertising, without-fonts, working-copy, with-full-fonts
     """
 
     def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
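
Note: a hedged sketch of how the new with-full-fonts flag could be passed to transform(); the WLDocument loader and the file names are assumptions for illustration, not part of this change:

    from librarian import epub
    from librarian.parser import WLDocument

    # Assumed entry point; the exact loader may differ in this version of librarian.
    doc = WLDocument.from_file('ballady-i-romanse.xml')

    # cover=True selects DefaultEbookCover; 'with-full-fonts' skips the font-optimizer
    # subsetting step and embeds the complete DejaVuSerif files instead.
    output = epub.transform(doc, verbose=True, cover=True, flags=('with-full-fonts',))
    # transform() returns an OutputFile wrapping the generated epub.
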
@@ -415,6 +442,11 @@ def transform(wldoc, verbose=False,
     # add editors info
     document.edoc.getroot().set('editors', u', '.join(sorted(
         editor.readable() for editor in document.editors())))
+    if document.book_info.funders:
+        document.edoc.getroot().set('funders', u', '.join(
+            document.book_info.funders))
+    if document.book_info.thanks:
+        document.edoc.getroot().set('thanks', document.book_info.thanks)
 
     opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
     manifest = opf.find('.//' + OPFNS('manifest'))
@@ -443,7 +475,7 @@ def transform(wldoc, verbose=False,
 
     if cover:
         if cover is True:
-            cover = WLCover
+            cover = DefaultEbookCover
 
         cover_file = StringIO()
         bound_cover = cover(document.book_info)
@@ -467,7 +499,7 @@ def transform(wldoc, verbose=False,
             '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
         manifest.append(etree.fromstring(
             '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
-        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
+        spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
         opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
         guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
 
@@ -506,6 +538,15 @@ def transform(wldoc, verbose=False,
         zip.writestr('OPS/annotations.html', etree.tostring(
                             html_tree, method="html", pretty_print=True))
 
+    toc.add("Wesprzyj Wolne Lektury", "support.html")
+    manifest.append(etree.fromstring(
+        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
+    spine.append(etree.fromstring(
+        '<itemref idref="support" />'))
+    html_string = open(get_resource('epub/support.html')).read()
+    chars.update(used_chars(etree.fromstring(html_string)))
+    zip.writestr('OPS/support.html', html_string)
+
     toc.add("Strona redakcyjna", "last.html")
     manifest.append(etree.fromstring(
         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
@@ -526,21 +567,23 @@ def transform(wldoc, verbose=False,
 
         os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
         for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
-            optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
-                              get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
-            if verbose:
-                print "Running font-optimizer"
-                subprocess.check_call(optimizer_call)
+            if not flags or 'with-full-fonts' not in flags:
+                optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
+                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
+                if verbose:
+                    print "Running font-optimizer"
+                    subprocess.check_call(optimizer_call)
+                else:
+                    subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
             else:
-                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
+                zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
             manifest.append(etree.fromstring(
-                '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
+                '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
         rmtree(tmpdir)
         if cwd is not None:
             os.chdir(cwd)
-
-    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
+    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration=True, encoding='UTF-8'))
     title = document.book_info.title
     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
     for st in attributes:
@@ -548,7 +591,7 @@ def transform(wldoc, verbose=False,
         meta.set('name', st)
         meta.set('content', '0')
         toc_file[0].append(meta)
-    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
+    toc_file[0][0].set('content', str(document.book_info.url))
     toc_file[0][1].set('content', str(toc.depth()))
     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
 
@@ -557,7 +600,7 @@ def transform(wldoc, verbose=False,
         toc.add(u"Spis treści", "toc.html", index=1)
         zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
     toc.write_to_xml(nav_map)
-    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
+    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration=True, encoding='UTF-8'))
     zip.close()
 
     return OutputFile.from_filename(output_file.name)