From 77004f7537a472419bf9701c8522a774a0c29245 Mon Sep 17 00:00:00 2001
From: Radek Czajka
Date: Tue, 5 Dec 2023 14:58:15 +0100
Subject: [PATCH] Add epub debugging.

EpubBuilder accepts a new keyword argument, debug (default False).
When it is set, WLElement.epub_build adds a data-debug="CHUNK:LINE"
attribute to the elements it starts, and XML comments of the form
TRIM:N met while building are passed to EpubBuilder.process_comment,
which records comment.sourceline - N in builder.splits.  CHUNK is
len(builder.splits) and LINE is the element's sourceline minus the
last recorded split (0 and the raw sourceline before any split).

Builders that do not define a debug attribute are treated as
non-debug, so they keep working when a document contains comments.

Add librarian.epubcheck.epubcheck(), which runs the external
epubcheck tool with JSON output and annotates each reported location
in an EPUB/part* file with wl_chunk and wl_line, read back from the
nearest preceding data-debug attribute on the reported line.

---
 CHANGELOG.md                   |  4 ++++
 setup.py                       |  2 +-
 src/librarian/builders/epub.py | 10 +++++++++-
 src/librarian/elements/base.py |  9 ++++++++-
 src/librarian/epubcheck.py     | 33 +++++++++++++++++++++++++++++++++
 5 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 src/librarian/epubcheck.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f523442..3a93be3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 This document records all notable changes to Librarian.
 
+## 23.12
+
+- Added debug version for epub and epubcheck utility.
+
 ## 23.10
 
 - Remove direct verse styling from HTML.
diff --git a/setup.py b/setup.py
index b14204f..cddab59 100755
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
 
 setup(
     name='librarian',
-    version='23.10',
+    version='23.12',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek Stępniowski",
     author_email='marek@stepniowski.com',
diff --git a/src/librarian/builders/epub.py b/src/librarian/builders/epub.py
index 8f21879..e009d4a 100644
--- a/src/librarian/builders/epub.py
+++ b/src/librarian/builders/epub.py
@@ -4,6 +4,7 @@
 from datetime import date
 import io
 import os
+import re
 import tempfile
 from ebooklib import epub
 from lxml import etree
@@ -80,9 +81,11 @@ class EpubBuilder(Builder):
     isbn_field = 'isbn_epub'
     orphans = True
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, debug=False, **kwargs):
         self.chars = set()
         self.fundr = 0
+        self.debug = debug
+        self.splits = []
         super().__init__(*args, **kwargs)
 
     def build(self, document, **kwargs):
@@ -707,3 +710,8 @@ class EpubBuilder(Builder):
             file_name=name
         )
         return name
+
+    def process_comment(self, comment):
+        m = re.match(r'TRIM:(\d+)', comment.text)
+        if m is not None:
+            self.splits.append(comment.sourceline - int(m.group(1)))
diff --git a/src/librarian/elements/base.py b/src/librarian/elements/base.py
index 863436c..b0d16ed 100644
--- a/src/librarian/elements/base.py
+++ b/src/librarian/elements/base.py
@@ -136,6 +136,8 @@ class WLElement(etree.ElementBase):
         for i, child in enumerate(self):
             if isinstance(child, WLElement):
                 getattr(child, build_method)(builder)
+            elif getattr(builder, 'debug', False) and child.tag is etree.Comment:
+                builder.process_comment(child)
             if self.CAN_HAVE_TEXT and child.tail:
                 text = self.normalize_text(child.tail, builder)
                 if self.STRIP and i == child_count - 1:
@@ -198,7 +200,7 @@ class WLElement(etree.ElementBase):
         # TEMPORARY
         self.CAN_HAVE_TEXT = True
         self.STRIP = False
-        
+
         start_chunk = self.EPUB_START_CHUNK and isinstance(self.getparent(), Master)
 
         if start_chunk:
@@ -220,6 +222,11 @@ class WLElement(etree.ElementBase):
             attr = self.get_epub_attr(builder)
             if fragment:
                 attr['id'] = fragment
+            if builder.debug:
+                chunkno, sourceline = 0, self.sourceline
+                if builder.splits:
+                    chunkno, sourceline = len(builder.splits), sourceline - builder.splits[-1]
+                attr['data-debug'] = f'{chunkno}:{sourceline}'
             builder.start_element(
                 self.EPUB_TAG,
                 attr
diff --git a/src/librarian/epubcheck.py b/src/librarian/epubcheck.py
new file mode 100644
index 0000000..00bfb13
--- /dev/null
+++ b/src/librarian/epubcheck.py
@@ -0,0 +1,33 @@
+import json
+import re
+import subprocess
+import zipfile
+
+
+def epubcheck(filename):
+    """Run epubcheck on filename; return its messages with WL debug info."""
+    p = subprocess.run(
+        [
+            'epubcheck', '-q',
+            '-j', '-',
+            filename
+        ],
+        capture_output=True
+    )
+    output = json.loads(p.stdout)
+    messages = output.get('messages', [])
+    # Close the archive when done instead of leaking the file handle.
+    with zipfile.ZipFile(filename) as epub:
+        for message in messages:
+            for loc in message.get('locations', []):
+                if loc['path'].startswith('EPUB/part'):
+                    with epub.open(loc['path']) as zfile:
+                        text = zfile.read().decode('utf-8')
+                    line = text.split('\n')[loc['line'] - 1][:loc['column'] - 1]
+                    # The last marker before the reported column wins.
+                    debug = re.findall(r' data-debug="(\d+):(\d+)', line)
+                    if debug:
+                        debug = debug[-1]
+                        loc['wl_chunk'] = int(debug[0])
+                        loc['wl_line'] = int(debug[1])
+    return messages
-- 
2.20.1