Add epub debugging. 23.12
author Radek Czajka <rczajka@rczajka.pl>
Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
CHANGELOG.md
setup.py
src/librarian/builders/epub.py
src/librarian/elements/base.py
src/librarian/epubcheck.py [new file with mode: 0644]

index f523442..3a93be3 100644 (file)
@@ -2,6 +2,10 @@
 
 This document records all notable changes to Librarian.
 
+## 23.12
+
+- Added debug version for epub and epubcheck utility.
+
 ## 23.10
 
 - Remove direct verse styling from HTML.
index b14204f..cddab59 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
 
 setup(
     name='librarian',
-    version='23.10',
+    version='23.12',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek Stępniowski",
     author_email='marek@stepniowski.com',
index 8f21879..e009d4a 100644 (file)
@@ -4,6 +4,7 @@
 from datetime import date
 import io
 import os
+import re
 import tempfile
 from ebooklib import epub
 from lxml import etree
@@ -80,9 +81,11 @@ class EpubBuilder(Builder):
     isbn_field = 'isbn_epub'
     orphans = True
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, debug=False, **kwargs):
         self.chars = set()
         self.fundr = 0
+        self.debug = debug
+        self.splits = []
         super().__init__(*args, **kwargs)
     
     def build(self, document, **kwargs):
@@ -707,3 +710,15 @@ class EpubBuilder(Builder):
             file_name=name
         )
         return name
+
+    def process_comment(self, comment):
+        """Record a split offset from a ``TRIM:<number>`` XML comment.
+
+        When the comment text starts with ``TRIM:`` followed by digits,
+        appends the comment's source line minus that number to
+        ``self.splits``.  Other comments are ignored.
+        """
+        trim = re.match(r'TRIM:(\d+)', comment.text)
+        if trim is None:
+            return
+        self.splits.append(comment.sourceline - int(trim.group(1)))
index 863436c..b0d16ed 100644 (file)
@@ -136,6 +136,8 @@ class WLElement(etree.ElementBase):
         for i, child in enumerate(self):
             if isinstance(child, WLElement):
                 getattr(child, build_method)(builder)
+            elif getattr(builder, 'debug', False) and child.tag is etree.Comment:
+                builder.process_comment(child)  # skipped if builder has no debug
             if self.CAN_HAVE_TEXT and child.tail:
                 text = self.normalize_text(child.tail, builder)
                 if self.STRIP and i == child_count - 1:
@@ -198,7 +200,7 @@ class WLElement(etree.ElementBase):
         # TEMPORARY
         self.CAN_HAVE_TEXT = True
         self.STRIP = False
-       
+
         start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)
 
         if start_chunk:
@@ -220,6 +222,11 @@ class WLElement(etree.ElementBase):
             attr = self.get_epub_attr(builder)
             if fragment:
                 attr['id'] = fragment
+            if builder.debug:
+                chunkno, sourceline = 0, self.sourceline
+                if builder.splits:
+                    chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
+                attr['data-debug'] = f'{chunkno}:{sourceline}'
             builder.start_element(
                 self.EPUB_TAG,
                 attr
diff --git a/src/librarian/epubcheck.py b/src/librarian/epubcheck.py
new file mode 100644 (file)
index 0000000..00bfb13
--- /dev/null
@@ -0,0 +1,58 @@
+import json
+import re
+import subprocess
+import zipfile
+
+
+def epubcheck(filename):
+    """Run epubcheck on an EPUB file and map its messages to source lines.
+
+    Runs the external ``epubcheck`` command and parses the JSON it writes
+    to stdout.  For every reported location inside an ``EPUB/part*`` file,
+    looks for the last ``data-debug="chunk:line"`` attribute preceding the
+    reported position and, if one is found, stores it on the location as
+    ``wl_chunk`` and ``wl_line``.
+
+    :param filename: path to the EPUB file to check
+    :return: list of epubcheck message dicts, annotated in place
+    """
+    p = subprocess.run(
+        [
+            'epubcheck', '-q',
+            '-j', '-',
+            filename
+        ],
+        capture_output=True
+    )
+    output = json.loads(p.stdout)
+    messages = output.get('messages', [])
+    # Lines of each part file, split once per path rather than per location.
+    part_lines = {}
+    # Context manager closes the archive even if a lookup below fails.
+    with zipfile.ZipFile(filename) as epub:
+        for message in messages:
+            for loc in message.get('locations', []):
+                path = loc['path']
+                if not path.startswith('EPUB/part'):
+                    continue
+                if path not in part_lines:
+                    with epub.open(path) as zfile:
+                        text = zfile.read().decode('utf-8')
+                    part_lines[path] = text.split('\n')
+                lines = part_lines[path]
+                lineno, column = loc['line'], loc['column']
+                # NOTE(review): epubcheck appears to report -1 when the
+                # position is unknown -- confirm.  Such values must not be
+                # used as indexes: they would silently pick a wrong line.
+                if not 1 <= lineno <= len(lines):
+                    continue
+                line = lines[lineno - 1]
+                if column >= 1:
+                    # Only markup before the reported column is relevant.
+                    line = line[:column - 1]
+                debug = re.findall(r' data-debug="(\d+):(\d+)', line)
+                if debug:
+                    # The closest preceding marker is the last one found.
+                    chunk, source_line = debug[-1]
+                    loc['wl_chunk'] = int(chunk)
+                    loc['wl_line'] = int(source_line)
+    return messages