From 295a1350b069ee750d2f7430e779e5ac6f23a673 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 15 Jun 2023 14:38:38 +0200 Subject: [PATCH] Add snippets. --- setup.py | 2 +- src/librarian/builders/__init__.py | 3 +- src/librarian/builders/html.py | 14 ++- src/librarian/document.py | 2 + src/librarian/elements/__init__.py | 3 +- src/librarian/elements/base.py | 92 ++++++++++++++++++- src/librarian/elements/footnotes/__init__.py | 1 + src/librarian/elements/ref/__init__.py | 11 ++- src/librarian/elements/themes/motyw.py | 1 + .../text/asnyk_miedzy_nami_expected.html | 2 +- .../asnyk_miedzy_nami_expected.legacy.html | 2 +- .../text/asnyk_miedzy_nami_fragments.html | 4 +- tests/files/text/miedzy-nami-nic-nie-bylo.xml | 2 +- 13 files changed, 124 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index 4c3412a..b87a856 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def whole_tree(prefix, path): setup( name='librarian', - version='2.4.13', + version='2.5', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author="Marek Stępniowski", author_email='marek@stepniowski.com', diff --git a/src/librarian/builders/__init__.py b/src/librarian/builders/__init__.py index d8acb82..5e3266c 100644 --- a/src/librarian/builders/__init__.py +++ b/src/librarian/builders/__init__.py @@ -1,6 +1,6 @@ from collections import OrderedDict from .txt import TxtBuilder -from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder +from .html import HtmlBuilder, SnippetHtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder from .sanitize import Sanitizer from .daisy import DaisyBuilder from .epub import EpubBuilder @@ -11,6 +11,7 @@ from .pdf import PdfBuilder builders = OrderedDict([ ("txt", TxtBuilder), ("html", HtmlBuilder), + ("html-snippet", SnippetHtmlBuilder), ("html-standalone", StandaloneHtmlBuilder), ("html-daisy", DaisyHtmlBuilder), ("daisy", DaisyBuilder), diff --git a/src/librarian/builders/html.py b/src/librarian/builders/html.py index ed222d3..18a5b36 100644 --- a/src/librarian/builders/html.py +++ b/src/librarian/builders/html.py @@ -24,7 +24,7 @@ class HtmlBuilder: self._base_url = base_url self.tree = text = etree.Element('div', **{'id': 'book-text'}) - self.header = etree.SubElement(text, 'h1') + self.header = etree.Element('h1') self.footnotes = etree.Element('div', id='footnotes') self.footnote_counter = 0 @@ -109,6 +109,9 @@ class HtmlBuilder: if self.with_toc: add_table_of_contents(self.tree) + if len(self.header): + self.tree.insert(0, self.header) + if self.footnote_counter: fnheader = etree.Element("h3") fnheader.text = _("Footnotes") @@ -183,6 +186,15 @@ class StandaloneHtmlBuilder(HtmlBuilder): ) +class SnippetHtmlBuilder(HtmlBuilder): + with_anchors = False + with_themes = False + with_toc = False + with_footnotes = False + with_nota_red = False + with_refs = False + + class DaisyHtmlBuilder(StandaloneHtmlBuilder): file_extension = 'xhtml' with_anchors = False diff --git a/src/librarian/document.py b/src/librarian/document.py index 6ac2842..d4063c5 100644 --- a/src/librarian/document.py +++ b/src/librarian/document.py @@ -115,3 +115,5 @@ class WLDocument: persons.remove(None) return persons + def references(self): + return self.tree.findall('.//ref') diff --git a/src/librarian/elements/__init__.py b/src/librarian/elements/__init__.py index c715e3a..ff4eb43 100644 --- a/src/librarian/elements/__init__.py +++ b/src/librarian/elements/__init__.py @@ -1,10 +1,11 @@ from lxml import etree from . import (blocks, comments, drama, figures, footnotes, front, headers, masters, paragraphs, poetry, ref, root, separators, styles, themes, - tools) + tools, base) WL_ELEMENTS = { + 'snippet': base.Snippet, 'meta': etree.ElementBase, 'coverClass': etree.ElementBase, "developmentStage": etree.ElementBase, diff --git a/src/librarian/elements/base.py b/src/librarian/elements/base.py index 6036d16..646067e 100644 --- a/src/librarian/elements/base.py +++ b/src/librarian/elements/base.py @@ -1,13 +1,25 @@ -# -*- coding: utf-8 - +import copy import re from lxml import etree from librarian import dcparser, RDFNS from librarian.util import get_translation +def last_words(text, n): + words = [] + for w in reversed(text.split()): + words.append(w) + if len(w) > 2: + n -= 1 + if not n: break + if n: + return n, text + else: + return n, ' '.join(reversed(words)) + class WLElement(etree.ElementBase): SECTION_PRECEDENCE = None + ASIDE = False TXT_TOP_MARGIN = 0 TXT_BOTTOM_MARGIN = 0 @@ -153,7 +165,7 @@ class WLElement(etree.ElementBase): # always copy the id attribute (?) if self.attrib.get('id'): attr['id'] = self.attrib['id'] - elif '_compat_section_id' in self.attrib: + elif getattr(self, 'SHOULD_HAVE_ID', False) and '_compat_section_id' in self.attrib: attr['id'] = self.attrib['_compat_section_id'] return attr @@ -234,3 +246,77 @@ class WLElement(etree.ElementBase): for e in self: if isinstance(e, WLElement): e.sanitize() + + def snip(self, words, before=None, sub=False): + if sub and self.ASIDE: + return words, [] + + snippet = [] + if before is not None: + i = self.index(before) + else: + i = len(self) + + while i > 0: + i -= 1 + if self[i].tail: + if words: + words, text = last_words(self[i].tail, words) + snippet = [('text', text)] + snippet + + if words: + words, subsnip = self[i].snip(words, sub=True) + snippet = subsnip + snippet + + if words and self.text: + words, text = last_words(self.text, words) + snippet = [('text', text)] + snippet + + snippet = [('start', self.tag, self.attrib)] + snippet + [('end',)] + + if not sub and words and not self.ASIDE: + # do we dare go up? + parent = self.getparent() + if parent is not None and parent.CAN_HAVE_TEXT: + print(etree.tostring(self, encoding='unicode')) + assert False + words, parsnip = parent.snip(words, before=self) + return words, parsnip[:-1] + snippet + parsnip[-1:] + + return words, snippet + + def get_snippet(self, words=15): + from librarian.parser import parser + + words, snippet = self.getparent().snip(words=words, before=self) + + cursor = snipelem = parser.makeelement('snippet') + snipelem._meta_object = self.meta + for s in snippet: + if s[0] == 'start': + elem = parser.makeelement(s[1], **s[2]) + cursor.append(elem) + cursor = elem + elif s[0] == 'end': + cursor = cursor.getparent() + else: + if len(cursor): + cursor[-1].tail = (cursor[-1].tail or '') + s[1] + else: + cursor.text = (cursor.text or '') + s[1] + + return snipelem + + def get_link(self): + sec = getattr(self, 'SHOULD_HAVE_ID', False) and self.attrib.get('_compat_section_id') + if sec: + return sec + parent_index = self.getparent().index(self) + if parent_index: + return self.getparent()[parent_index - 1].get_link() + else: + return self.getparent().get_link() + + +class Snippet(WLElement): + pass diff --git a/src/librarian/elements/footnotes/__init__.py b/src/librarian/elements/footnotes/__init__.py index 398fdea..e18eaae 100644 --- a/src/librarian/elements/footnotes/__init__.py +++ b/src/librarian/elements/footnotes/__init__.py @@ -7,6 +7,7 @@ from ..base import WLElement class Footnote(WLElement): NO_TOC = True START_INLINE = True + ASIDE = True def signal(self, signal): if signal == 'INLINE': diff --git a/src/librarian/elements/ref/__init__.py b/src/librarian/elements/ref/__init__.py index 2019da1..7a72a38 100644 --- a/src/librarian/elements/ref/__init__.py +++ b/src/librarian/elements/ref/__init__.py @@ -2,12 +2,17 @@ from ..base import WLElement class Ref(WLElement): + ASIDE = True + HTML_TAG = 'a' + def txt_build(self, builder): pass - def html_build(self, builder): - pass + def get_html_attr(self, builder): + return { + 'class': 'reference', + 'data-uri': self.attrib.get('href', ''), + } def epub_build(self, builder): pass - diff --git a/src/librarian/elements/themes/motyw.py b/src/librarian/elements/themes/motyw.py index 25369a7..9f6a886 100644 --- a/src/librarian/elements/themes/motyw.py +++ b/src/librarian/elements/themes/motyw.py @@ -2,6 +2,7 @@ from ..base import WLElement class Motyw(WLElement): + ASIDE = True HTML_TAG = "a" def txt_build(self, builder): diff --git a/tests/files/text/asnyk_miedzy_nami_expected.html b/tests/files/text/asnyk_miedzy_nami_expected.html index 99c729a..91c1135 100644 --- a/tests/files/text/asnyk_miedzy_nami_expected.html +++ b/tests/files/text/asnyk_miedzy_nami_expected.html @@ -34,7 +34,7 @@
Prócz tych wspólnych, jasnych zdrojów,
Z których serce zachwyt piło;
- 15
Prócz pierwiosnków i powojów,—
+ 15
Prócz pierwiosnków i powojów,—
Między nami nic nie było!
diff --git a/tests/files/text/asnyk_miedzy_nami_expected.legacy.html b/tests/files/text/asnyk_miedzy_nami_expected.legacy.html index dce71bb..a0eba08 100644 --- a/tests/files/text/asnyk_miedzy_nami_expected.legacy.html +++ b/tests/files/text/asnyk_miedzy_nami_expected.legacy.html @@ -34,7 +34,7 @@
Prócz tych wspólnych, jasnych zdrojów,
Z których serce zachwyt piło;
- 15
Prócz pierwiosnków i powojów,—
+ 15
Prócz pierwiosnków i powojów,—
Między nami nic nie było!
diff --git a/tests/files/text/asnyk_miedzy_nami_fragments.html b/tests/files/text/asnyk_miedzy_nami_fragments.html index 2a5713c..eeeaee6 100644 --- a/tests/files/text/asnyk_miedzy_nami_fragments.html +++ b/tests/files/text/asnyk_miedzy_nami_fragments.html @@ -29,7 +29,7 @@ Prócz tych woni, barw i blasków,
Prócz tych wspólnych, jasnych zdrojów,
Z których serce zachwyt piło;
-
Prócz pierwiosnków i powojów,—
+
Prócz pierwiosnków i powojów,—
Między nami nic nie było!
@@ -49,6 +49,6 @@ Prócz tych woni, barw i blasków,
Prócz tych wspólnych, jasnych zdrojów,
Z których serce zachwyt piło;
-
Prócz pierwiosnków i powojów,—
+
Prócz pierwiosnków i powojów,—
Między nami nic nie było!
diff --git a/tests/files/text/miedzy-nami-nic-nie-bylo.xml b/tests/files/text/miedzy-nami-nic-nie-bylo.xml index 8036fce..84f2911 100644 --- a/tests/files/text/miedzy-nami-nic-nie-bylo.xml +++ b/tests/files/text/miedzy-nami-nic-nie-bylo.xml @@ -60,7 +60,7 @@ Prócz natury słodkich czarów; Prócz tych wspólnych, jasnych zdrojów,/ Z których serce zachwyt piło;/ -Prócz pierwiosnków i powojów,---/ +Prócz pierwiosnkówpierwiosnek --- taki kwiatek i powojów,---/ Między nami nic nie było! -- 2.20.1