New mobi builder.

author Radek Czajka <rczajka@rczajka.pl>

Mon, 20 Dec 2021 10:50:38 +0000 (11:50 +0100)

committer Radek Czajka <rczajka@rczajka.pl>

Mon, 20 Dec 2021 10:50:38 +0000 (11:50 +0100)
author Radek Czajka <rczajka@rczajka.pl>
Mon, 20 Dec 2021 10:50:38 +0000 (11:50 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Mon, 20 Dec 2021 10:50:38 +0000 (11:50 +0100)
diff --git a/src/librarian/builders/__init__.py b/src/librarian/builders/__init__.py

index e359cd6..d8acb82 100644 (file)
--- a/src/librarian/builders/__init__.py
+++ b/src/librarian/builders/__init__.py
@@ -4,6 +4,7 @@ from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder
  from .sanitize import Sanitizer
  from .daisy import DaisyBuilder
  from .epub import EpubBuilder
+from .mobi import MobiBuilder
  from .pdf import PdfBuilder
  
  
@@ -16,5 +17,6 @@ builders = OrderedDict([
      ("sanitizer", Sanitizer),
  
      ("epub", EpubBuilder),
+    ("mobi", MobiBuilder),
      ("pdf", PdfBuilder),
  ])
diff --git a/src/librarian/builders/epub.py b/src/librarian/builders/epub.py

index 4471e30..401136f 100644 (file)
--- a/src/librarian/builders/epub.py
+++ b/src/librarian/builders/epub.py
@@ -29,10 +29,11 @@ class Xhtml:
  class Builder:
      file_extension = None
  
-    def __init__(self, base_url=None, fundraising=None):
+    def __init__(self, base_url=None, fundraising=None, cover=None):
          self._base_url = base_url or 'file:///home/rczajka/for/fnp/librarian/temp~/maly/img/'
          self.fundraising = fundraising
          self.footnotes = etree.Element('div', id='footnotes')
+        self.make_cover = cover or make_cover
  
          self.cursors = {
  #            None: None,
@@ -78,6 +79,7 @@ class Builder:
  
  class EpubBuilder(Builder):
      file_extension = 'epub'
+    isbn_field = 'isbn_epub'
  
      def __init__(self, *args, **kwargs):
          self.chars = set()
@@ -109,7 +111,6 @@ class EpubBuilder(Builder):
  
          self.set_metadata()
          
-
          self.add_cover()
          
          self.add_title_page()
@@ -212,10 +213,10 @@ class EpubBuilder(Builder):
  
          e = self.document.tree.find('//autor_utworu')
          if e is not None:
-            etree.SubElement(tp, 'h2', **{'class': 'author'}).text = e.raw_printable_text()
+            etree.SubElement(tp, 'h2', **{'class': 'author'}).text = e.raw_printable_text(self)
          e = self.document.tree.find('//nazwa_utworu')
          if e is not None:
-            etree.SubElement(tp, 'h1', **{'class': 'title'}).text = e.raw_printable_text()
+            etree.SubElement(tp, 'h1', **{'class': 'title'}).text = e.raw_printable_text(self)
  
          if not len(tp):
              for author in self.document.meta.authors:
@@ -251,8 +252,8 @@ class EpubBuilder(Builder):
            </p>
          """))
  
-        if self.document.meta.isbn_epub:
-            etree.SubElement(tp, 'p', **{"class": "info"}).text = self.document.meta.isbn_epub
+        if getattr(self.document.meta, self.isbn_field):
+            etree.SubElement(tp, 'p', **{"class": "info"}).text = getattr(self.document.meta, self.isbn_field)
  
          tp.append(etree.XML("""<p class="footer info">
              <a href="http://www.wolnelektury.pl/"><img src="logo_wolnelektury.png" alt="WolneLektury.pl" /></a>
@@ -589,8 +590,8 @@ class EpubBuilder(Builder):
              else:
                  p.text += m.cover_by
              
-        if m.isbn_epub:
-            newp().text = m.isbn_epub
+        if getattr(m, self.isbn_field):
+            newp().text = getattr(m, self.isbn_field)
  
          newp().text = '\u00a0'
  
@@ -644,10 +645,10 @@ class EpubBuilder(Builder):
      def add_cover(self):
          # TODO: allow other covers
  
-        cover_maker = make_cover
+        cover_maker = self.make_cover
  
          cover_file = six.BytesIO()
-        cover = cover_maker(self.document.meta)
+        cover = cover_maker(self.document.meta, width=600)
          cover.save(cover_file)
          cover_name = 'cover.%s' % cover.ext()
  
diff --git a/src/librarian/builders/mobi.py b/src/librarian/builders/mobi.py

new file mode 100644 (file)

index 0000000..19b5036
--- /dev/null
+++ b/src/librarian/builders/mobi.py
@@ -0,0 +1,45 @@
+import os
+import six
+import subprocess
+from tempfile import NamedTemporaryFile
+from librarian import functions, get_resource, OutputFile
+from librarian.hyphenator import Hyphenator
+from .epub import EpubBuilder
+
+
+class MobiBuilder(EpubBuilder):
+    file_extension = 'mobi'
+    isbn_field = 'isbn_mobi'
+
+    def build(self, document, use_kindlegen=False, converter_path=None, **kwargs):
+        bibl_lng = document.meta.language
+        short_lng = functions.lang_code_3to2(bibl_lng)
+        try:
+            self.hyphenator = Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
+                                       short_lng + '.dic'))
+        except:
+            pass
+
+        epub = super().build(document, **kwargs)
+
+        devnull = open("/dev/null", 'w')
+        gen_kwargs = {"stdout": devnull, "stderr": devnull}
+
+        output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi',
+                                     delete=False)
+        output_file.close()
+
+        if use_kindlegen:
+            output_file_basename = os.path.basename(output_file.name)
+            subprocess.check_call([converter_path or 'kindlegen',
+                               '-c2', epub.get_filename(),
+                               '-o', output_file_basename], **gen_kwargs)
+        else:
+            subprocess.check_call([converter_path or 'ebook-convert',
+                               epub.get_filename(),
+                               output_file.name, '--no-inline-toc',
+                               '--mobi-file-type=both',
+                               '--mobi-ignore-margins',
+                               ], **gen_kwargs)
+        return OutputFile.from_filename(output_file.name)
+
diff --git a/src/librarian/elements/__init__.py b/src/librarian/elements/__init__.py

index 6ec0ba7..c934299 100644 (file)
--- a/src/librarian/elements/__init__.py
+++ b/src/librarian/elements/__init__.py
@@ -115,7 +115,6 @@ WL_ELEMENTS = {
      "animacja": figures.Animacja,
      "ilustr": figures.Ilustr,
  
-    "ref": etree.ElementBase,
      # Section
      "wywiad_pyt": blocks.WywiadPyt,
      "wywiad_odp": blocks.WywiadOdp,
diff --git a/src/librarian/elements/base.py b/src/librarian/elements/base.py

index 3e0f898..46ae29f 100644 (file)
--- a/src/librarian/elements/base.py
+++ b/src/librarian/elements/base.py
@@ -3,7 +3,6 @@
  import re
  from lxml import etree
  from librarian import dcparser, RDFNS
-from librarian.html import raw_printable_text
  from librarian.util import get_translation
  
  
@@ -76,25 +75,35 @@ class WLElement(etree.ElementBase):
          if parent is not None:
              parent.signal(signal)
      
-    def raw_printable_text(self):
+    def raw_printable_text(self, builder):
+        from librarian.html import raw_printable_text
+
          # TODO: podtagi, wyroznienia, etc
          t = ''
-        t += self.normalize_text(self.text)
+        t += self.normalize_text(self.text, builder)
          for c in self:
              if not isinstance(c, WLElement):
                  continue
              if c.tag not in ('pe', 'pa', 'pt', 'pr', 'motyw'):
-                t += c.raw_printable_text()
-            t += self.normalize_text(c.tail)
+                t += c.raw_printable_text(builder)
+            t += self.normalize_text(c.tail, builder)
          return t
      
-    def normalize_text(self, text):
+    def normalize_text(self, text, builder):
          text = text or ''
          for e, s in self.text_substitutions:
              text = text.replace(e, s)
              # FIXME: TEmporary turnoff
  #        text = re.sub(r'\s+', ' ', text)
  ### TODO: Added now for epub
+
+        if getattr(builder, 'hyphenator', None) is not None:
+            newt = ''
+            wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(text)
+            for w in wlist:
+                newt += builder.hyphenator.inserted(w, u'\u00AD')
+            text = newt
+
          text = re.sub(r'(?<=\s\w)\s+', u'\u00A0', text)
  
          return text
@@ -102,7 +111,7 @@ class WLElement(etree.ElementBase):
      def _build_inner(self, builder, build_method):
          child_count = len(self)
          if self.CAN_HAVE_TEXT and self.text:
-            text = self.normalize_text(self.text)
+            text = self.normalize_text(self.text, builder)
              if self.STRIP:
                  text = text.lstrip()
                  if not child_count:
@@ -112,7 +121,7 @@ class WLElement(etree.ElementBase):
              if isinstance(child, WLElement):
                  getattr(child, build_method)(builder)
              if self.CAN_HAVE_TEXT and child.tail:
-                text = self.normalize_text(child.tail)
+                text = self.normalize_text(child.tail, builder)
                  if self.STRIP and i == child_count - 1:
                      text = text.rstrip()
                  builder.push_text(text)
@@ -187,7 +196,7 @@ class WLElement(etree.ElementBase):
  
              builder.add_toc_entry(
                  fragment,
-                self.raw_printable_text(),
+                self.raw_printable_text(builder),
                  self.SECTION_PRECEDENCE
              )
              
diff --git a/src/librarian/elements/styles/tytul_dziela.py b/src/librarian/elements/styles/tytul_dziela.py

index 906d98c..77852d5 100644 (file)
--- a/src/librarian/elements/styles/tytul_dziela.py
+++ b/src/librarian/elements/styles/tytul_dziela.py
@@ -6,8 +6,8 @@ class TytulDziela(WLElement):
      EPUB_TAG = HTML_TAG = 'em'
      EPUB_CLASS = HTML_CLASS = 'book-title'
  
-    def normalize_text(self, text):
-        txt = super(TytulDziela, self).normalize_text(text)
+    def normalize_text(self, text, builder):
+        txt = super(TytulDziela, self).normalize_text(text, builder)
          if self.attrib.get('typ') == '1':
              txt = '„{txt}”'.format(txt=txt)
          return txt
diff --git a/src/librarian/hyphenator.py b/src/librarian/hyphenator.py

index aa5b4c3..fdd50d4 100644 (file)
--- a/src/librarian/hyphenator.py
+++ b/src/librarian/hyphenator.py
@@ -81,13 +81,13 @@ class Hyph_dict(object):
      """
      def __init__(self, filename):
          self.patterns = {}
-        f = open(filename)
+        f = open(filename, 'rb')
          charset = f.readline().strip()
-        if charset.startswith('charset '):
+        if charset.startswith(b'charset '):
              charset = charset[8:].strip()
  
          for pat in f:
-            pat = pat.decode(charset).strip()
+            pat = pat.decode(charset.decode('latin1')).strip()
              if not pat or pat[0] == '%': continue
              # replace ^^hh with the real character
              pat = parse_hex(hexrepl, pat)
@@ -211,7 +211,7 @@ class Hyphenator(object):
          the string 'let-ter-gre-pen'. The hyphen string to use can be
          given as the second parameter, that defaults to '-'.
          """
-        if isinstance(word, str):
+        if isinstance(word, bytes):
              word = word.decode('latin1')
          l = list(word)
          for p in reversed(self.positions(word)):
author	Radek Czajka <rczajka@rczajka.pl>
	Mon, 20 Dec 2021 10:50:38 +0000 (11:50 +0100)
committer	Radek Czajka <rczajka@rczajka.pl>
	Mon, 20 Dec 2021 10:50:38 +0000 (11:50 +0100)
src/librarian/builders/__init__.py		patch | blob | history
src/librarian/builders/epub.py		patch | blob | history
src/librarian/builders/mobi.py	[new file with mode: 0644]	patch | blob
src/librarian/elements/__init__.py		patch | blob | history
src/librarian/elements/base.py		patch | blob | history
src/librarian/elements/styles/tytul_dziela.py		patch | blob | history
src/librarian/hyphenator.py		patch | blob | history