Add epub debugging.

author Radek Czajka <rczajka@rczajka.pl>

Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)

committer Radek Czajka <rczajka@rczajka.pl>

Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
author Radek Czajka <rczajka@rczajka.pl>
Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
diff --git a/CHANGELOG.md b/CHANGELOG.md

index f523442..3a93be3 100644 (file)
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
  
  This document records all notable changes to Librarian.
  
+## 23.12
+
+- Added a debug option to the EPUB builder and an `epubcheck` utility.
+
  ## 23.10
  
  - Remove direct verse styling from HTML.
diff --git a/setup.py b/setup.py

index b14204f..cddab59 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
  
  setup(
      name='librarian',
-    version='23.10',
+    version='23.12',
      description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
      author="Marek Stępniowski",
      author_email='marek@stepniowski.com',
diff --git a/src/librarian/builders/epub.py b/src/librarian/builders/epub.py

index 8f21879..e009d4a 100644 (file)
--- a/src/librarian/builders/epub.py
+++ b/src/librarian/builders/epub.py
@@ -4,6 +4,7 @@
  from datetime import date
  import io
  import os
+import re
  import tempfile
  from ebooklib import epub
  from lxml import etree
@@ -80,9 +81,11 @@ class EpubBuilder(Builder):
      isbn_field = 'isbn_epub'
      orphans = True
  
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, debug=False, **kwargs):
          self.chars = set()
          self.fundr = 0
+        self.debug = debug
+        self.splits = []
          super().__init__(*args, **kwargs)
      
      def build(self, document, **kwargs):
@@ -707,3 +710,8 @@ class EpubBuilder(Builder):
              file_name=name
          )
          return name
+
+    def process_comment(self, comment):
+        """Record a split point when *comment* is a ``TRIM:<n>`` marker.
+
+        The comment text is matched from its start against ``TRIM:``
+        followed by decimal digits.  On a match, ``comment.sourceline - n``
+        is appended to ``self.splits``; any other comment is ignored.
+        """
+        trim = re.match(r'TRIM:(\d+)', comment.text)
+        if trim is None:
+            return
+        trimmed = int(trim.group(1))
+        self.splits.append(comment.sourceline - trimmed)
diff --git a/src/librarian/elements/base.py b/src/librarian/elements/base.py

index 863436c..b0d16ed 100644 (file)
--- a/src/librarian/elements/base.py
+++ b/src/librarian/elements/base.py
@@ -136,6 +136,8 @@ class WLElement(etree.ElementBase):
          for i, child in enumerate(self):
              if isinstance(child, WLElement):
                  getattr(child, build_method)(builder)
+            elif getattr(builder, 'debug') and child.tag is etree.Comment:
+                builder.process_comment(child)
              if self.CAN_HAVE_TEXT and child.tail:
                  text = self.normalize_text(child.tail, builder)
                  if self.STRIP and i == child_count - 1:
@@ -198,7 +200,7 @@ class WLElement(etree.ElementBase):
          # TEMPORARY
          self.CAN_HAVE_TEXT = True
          self.STRIP = False
-       
+
          start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)
  
          if start_chunk:
@@ -220,6 +222,11 @@ class WLElement(etree.ElementBase):
              attr = self.get_epub_attr(builder)
              if fragment:
                  attr['id'] = fragment
+            if builder.debug:
+                chunkno, sourceline = 0, self.sourceline
+                if builder.splits:
+                    chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
+                attr['data-debug'] = f'{chunkno}:{sourceline}'
              builder.start_element(
                  self.EPUB_TAG,
                  attr
diff --git a/src/librarian/epubcheck.py b/src/librarian/epubcheck.py

new file mode 100644 (file)

index 0000000..00bfb13
--- /dev/null
+++ b/src/librarian/epubcheck.py
@@ -0,0 +1,33 @@
+import json
+import re
+import subprocess
+import zipfile
+
+
+def epubcheck(filename):
+    """Run the ``epubcheck`` tool on *filename* and return its messages.
+
+    ``epubcheck`` is invoked with ``-q -j -`` and its standard output is
+    parsed as JSON.  For every reported location whose path starts with
+    ``EPUB/part``, the referenced line of that archive member is searched
+    (up to the reported column) for ``data-debug="<chunk>:<line>"``
+    attributes; the last one found is stored on the location as the
+    integers ``wl_chunk`` and ``wl_line``.
+
+    Returns the ``messages`` list from the JSON report (empty when the key
+    is absent), with matching locations annotated in place.
+    """
+    p = subprocess.run(
+        [
+            'epubcheck', '-q',
+            '-j', '-',
+            filename
+        ],
+        capture_output=True
+    )
+    output = json.loads(p.stdout)
+    messages = output.get('messages', [])
+    # Split lines of each part file, keyed by archive path, so a file
+    # referenced by several locations is read and decoded only once.
+    parts = {}
+    # The context manager closes the archive even if annotation fails.
+    with zipfile.ZipFile(filename) as epub:
+        for message in messages:
+            for loc in message.get('locations', []):
+                path = loc['path']
+                if not path.startswith('EPUB/part'):
+                    continue
+                if path not in parts:
+                    with epub.open(path) as zfile:
+                        text = zfile.read().decode('utf-8')
+                    parts[path] = text.split('\n')
+                lines = parts[path]
+                lineno, column = loc['line'], loc['column']
+                # A line outside 1..len(lines) cannot be mapped back; a
+                # non-positive value would otherwise index from the end of
+                # the file and annotate the wrong element.
+                if not 1 <= lineno <= len(lines):
+                    continue
+                line = lines[lineno - 1]
+                # Without a usable column, search the whole line instead
+                # of letting a negative slice bound trim its tail.
+                if column >= 1:
+                    line = line[:column - 1]
+                debug = re.findall(r' data-debug="(\d+):(\d+)', line)
+                if debug:
+                    chunk, wl_line = debug[-1]
+                    loc['wl_chunk'] = int(chunk)
+                    loc['wl_line'] = int(wl_line)
+    return messages
+
+            
+
author	Radek Czajka <rczajka@rczajka.pl>
	Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
committer	Radek Czajka <rczajka@rczajka.pl>
	Tue, 5 Dec 2023 13:58:15 +0000 (14:58 +0100)
CHANGELOG.md		patch \| blob \| history
setup.py		patch \| blob \| history
src/librarian/builders/epub.py		patch \| blob \| history
src/librarian/elements/base.py		patch \| blob \| history
src/librarian/epubcheck.py	[new file with mode: 0644]	patch \| blob