From 0604bdd5f693da9f1c78f9d9fa2276f0c7b6c17b Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 31 Aug 2020 15:29:43 +0200 Subject: [PATCH] New Element-based builder API (WiP). --- setup.py | 5 + src/librarian/builders/__init__.py | 14 ++ src/librarian/builders/html.py | 72 ++++++++ src/librarian/builders/sanitize.py | 18 ++ src/librarian/builders/txt.py | 167 ++++++++++++++++++ src/librarian/command_line.py | 43 +++++ src/librarian/dcparser.py | 10 +- src/librarian/document.py | 24 +++ src/librarian/elements/__init__.py | 115 ++++++++++++ src/librarian/elements/base.py | 139 +++++++++++++++ src/librarian/elements/blocks/__init__.py | 3 + src/librarian/elements/blocks/dlugi_cytat.py | 10 ++ src/librarian/elements/blocks/nota.py | 5 + src/librarian/elements/blocks/poezja_cyt.py | 10 ++ src/librarian/elements/comments/__init__.py | 2 + src/librarian/elements/comments/abstrakt.py | 9 + src/librarian/elements/comments/uwaga.py | 7 + src/librarian/elements/drama/__init__.py | 8 + src/librarian/elements/drama/didask_tekst.py | 6 + src/librarian/elements/drama/didaskalia.py | 11 ++ src/librarian/elements/drama/kwestia.py | 6 + src/librarian/elements/drama/lista_osob.py | 11 ++ src/librarian/elements/drama/lista_osoba.py | 10 ++ .../elements/drama/naglowek_listy.py | 5 + .../elements/drama/naglowek_osoba.py | 9 + src/librarian/elements/drama/osoba.py | 6 + src/librarian/elements/figures/__init__.py | 1 + src/librarian/elements/figures/ilustr.py | 10 ++ src/librarian/elements/footnotes/__init__.py | 7 + src/librarian/elements/front/__init__.py | 6 + src/librarian/elements/front/autor_utworu.py | 8 + src/librarian/elements/front/base.py | 15 ++ .../elements/front/dzielo_nadrzedne.py | 6 + src/librarian/elements/front/motto.py | 6 + src/librarian/elements/front/motto_podpis.py | 5 + src/librarian/elements/front/nazwa_utworu.py | 9 + src/librarian/elements/front/podtytul.py | 8 + src/librarian/elements/headers/__init__.py | 3 + .../elements/headers/naglowek_czesc.py | 8 + 
.../elements/headers/naglowek_podrozdzial.py | 8 + .../elements/headers/naglowek_rozdzial.py | 10 ++ src/librarian/elements/masters/__init__.py | 7 + src/librarian/elements/paragraphs/__init__.py | 1 + src/librarian/elements/paragraphs/akap.py | 15 ++ src/librarian/elements/poetry/__init__.py | 5 + src/librarian/elements/poetry/strofa.py | 50 ++++++ src/librarian/elements/poetry/wers.py | 13 ++ src/librarian/elements/poetry/wers_cd.py | 6 + src/librarian/elements/poetry/wers_wciety.py | 16 ++ .../elements/poetry/zastepnik_wersu.py | 5 + src/librarian/elements/root/__init__.py | 19 ++ src/librarian/elements/separators/__init__.py | 3 + .../elements/separators/sekcja_asterysk.py | 11 ++ .../elements/separators/sekcja_swiatlo.py | 7 + .../elements/separators/separator_linia.py | 11 ++ src/librarian/elements/styles/__init__.py | 3 + src/librarian/elements/styles/slowo_obce.py | 5 + src/librarian/elements/styles/tytul_dziela.py | 10 ++ src/librarian/elements/styles/wyroznienie.py | 7 + src/librarian/elements/themes/__init__.py | 3 + src/librarian/elements/themes/begin.py | 5 + src/librarian/elements/themes/end.py | 5 + src/librarian/elements/themes/motyw.py | 14 ++ src/librarian/parser.py | 20 +++ tests/test_text.py | 23 ++- tox.ini | 2 +- 66 files changed, 1090 insertions(+), 11 deletions(-) create mode 100644 src/librarian/builders/__init__.py create mode 100644 src/librarian/builders/html.py create mode 100644 src/librarian/builders/sanitize.py create mode 100644 src/librarian/builders/txt.py create mode 100644 src/librarian/command_line.py create mode 100644 src/librarian/document.py create mode 100644 src/librarian/elements/__init__.py create mode 100644 src/librarian/elements/base.py create mode 100644 src/librarian/elements/blocks/__init__.py create mode 100644 src/librarian/elements/blocks/dlugi_cytat.py create mode 100644 src/librarian/elements/blocks/nota.py create mode 100644 src/librarian/elements/blocks/poezja_cyt.py create mode 100644 
src/librarian/elements/comments/__init__.py create mode 100644 src/librarian/elements/comments/abstrakt.py create mode 100644 src/librarian/elements/comments/uwaga.py create mode 100644 src/librarian/elements/drama/__init__.py create mode 100644 src/librarian/elements/drama/didask_tekst.py create mode 100644 src/librarian/elements/drama/didaskalia.py create mode 100644 src/librarian/elements/drama/kwestia.py create mode 100644 src/librarian/elements/drama/lista_osob.py create mode 100644 src/librarian/elements/drama/lista_osoba.py create mode 100644 src/librarian/elements/drama/naglowek_listy.py create mode 100644 src/librarian/elements/drama/naglowek_osoba.py create mode 100644 src/librarian/elements/drama/osoba.py create mode 100644 src/librarian/elements/figures/__init__.py create mode 100644 src/librarian/elements/figures/ilustr.py create mode 100644 src/librarian/elements/footnotes/__init__.py create mode 100644 src/librarian/elements/front/__init__.py create mode 100644 src/librarian/elements/front/autor_utworu.py create mode 100644 src/librarian/elements/front/base.py create mode 100644 src/librarian/elements/front/dzielo_nadrzedne.py create mode 100644 src/librarian/elements/front/motto.py create mode 100644 src/librarian/elements/front/motto_podpis.py create mode 100644 src/librarian/elements/front/nazwa_utworu.py create mode 100644 src/librarian/elements/front/podtytul.py create mode 100644 src/librarian/elements/headers/__init__.py create mode 100644 src/librarian/elements/headers/naglowek_czesc.py create mode 100644 src/librarian/elements/headers/naglowek_podrozdzial.py create mode 100644 src/librarian/elements/headers/naglowek_rozdzial.py create mode 100644 src/librarian/elements/masters/__init__.py create mode 100644 src/librarian/elements/paragraphs/__init__.py create mode 100644 src/librarian/elements/paragraphs/akap.py create mode 100644 src/librarian/elements/poetry/__init__.py create mode 100644 src/librarian/elements/poetry/strofa.py create mode 
100644 src/librarian/elements/poetry/wers.py create mode 100644 src/librarian/elements/poetry/wers_cd.py create mode 100644 src/librarian/elements/poetry/wers_wciety.py create mode 100644 src/librarian/elements/poetry/zastepnik_wersu.py create mode 100644 src/librarian/elements/root/__init__.py create mode 100644 src/librarian/elements/separators/__init__.py create mode 100644 src/librarian/elements/separators/sekcja_asterysk.py create mode 100644 src/librarian/elements/separators/sekcja_swiatlo.py create mode 100644 src/librarian/elements/separators/separator_linia.py create mode 100644 src/librarian/elements/styles/__init__.py create mode 100644 src/librarian/elements/styles/slowo_obce.py create mode 100644 src/librarian/elements/styles/tytul_dziela.py create mode 100644 src/librarian/elements/styles/wyroznienie.py create mode 100644 src/librarian/elements/themes/__init__.py create mode 100644 src/librarian/elements/themes/begin.py create mode 100644 src/librarian/elements/themes/end.py create mode 100644 src/librarian/elements/themes/motyw.py diff --git a/setup.py b/setup.py index 0466e08..1ddf324 100755 --- a/setup.py +++ b/setup.py @@ -41,6 +41,11 @@ setup( 'texml', 'ebooklib', ], + entry_points = { + "console_scripts": [ + "librarian=librarian.command_line:main" + ] + }, scripts=['scripts/book2html', 'scripts/book2txt', 'scripts/book2epub', diff --git a/src/librarian/builders/__init__.py b/src/librarian/builders/__init__.py new file mode 100644 index 0000000..fcd9194 --- /dev/null +++ b/src/librarian/builders/__init__.py @@ -0,0 +1,14 @@ +from .txt import TxtBuilder +from .html import HtmlBuilder +from .sanitize import Sanitizer + + +builders = [ + TxtBuilder, + HtmlBuilder, + Sanitizer, +] + + +def get_builder_class(builder_id): + return next(b for b in builders if b.identifier == builder_id) diff --git a/src/librarian/builders/html.py b/src/librarian/builders/html.py new file mode 100644 index 0000000..8015c6a --- /dev/null +++ 
b/src/librarian/builders/html.py
@@ -0,0 +1,94 @@
+from lxml import etree
+from librarian import OutputFile
+
+
+class HtmlBuilder:
+    """Build an HTML rendition of a document.
+
+    Output is collected in named fragments.  Each fragment has a cursor:
+    the element that currently receives new child elements and text.
+    The main text fragment is stored under the key ``None``.
+    """
+    file_extension = "html"
+    identifier = "html"
+
+    def __init__(self, image_location='https://wolnelektury.pl/media/book/pictures/marcos-historia-kolorow/'):
+        # Prefix for the ``src`` attribute of generated images.
+        self.image_location = image_location
+
+        #self.tree = etree.Element('html')
+        #body = etree.SubElement(self.tree, 'body')
+        #text = etree.SubElement(body, 'div', **{'id': 'book-text'})
+        self.tree = text = etree.Element('div', **{'id': 'book-text'})
+        toc = etree.SubElement(text, 'div', id='toc')
+        themes = etree.SubElement(text, 'div', id='themes')
+        h1 = etree.SubElement(text, 'h1')
+
+        # Current insertion point of every fragment.
+        self.cursors = {
+            None: text,
+            'toc': toc,
+            'themes': themes,
+            'header': h1,
+        }
+        # Stack of active fragment names; output goes to the last one.
+        self.current_cursors = [None]
+
+    def enter_fragment(self, fragment):
+        """Direct subsequent output to the fragment named ``fragment``."""
+        self.current_cursors.append(fragment)
+
+    def exit_fragment(self):
+        """Go back to the fragment that was active before the last enter."""
+        self.current_cursors.pop()
+
+    def build(self, document):
+        """Render ``document`` and return the serialized HTML.
+
+        The result is whatever ``OutputFile.from_bytes`` makes of the
+        UTF-8 encoded markup.
+        """
+        document.tree.getroot().html_build(self)
+
+        head = etree.Element('head')
+        self.tree.insert(0, head)
+        etree.SubElement(
+            head,
+            'link',
+            href="https://static.wolnelektury.pl/css/compressed/book_text.b15153e56c0a.css",
+            rel="stylesheet",
+            type="text/css",
+        )
+
+        return OutputFile.from_bytes(
+            etree.tostring(
+                self.tree,
+                method='html',
+                encoding='utf-8',
+                pretty_print=True
+            )
+        )
+
+    def start_element(self, tag, attrib):
+        """Open element ``tag`` at the active cursor and descend into it."""
+        self.cursors[self.current_cursors[-1]] = etree.SubElement(
+            self.cursors[self.current_cursors[-1]],
+            tag,
+            **attrib
+        )
+
+    def end_element(self):
+        """Close the open element: move the active cursor to its parent."""
+        self.cursors[self.current_cursors[-1]] = self.cursors[self.current_cursors[-1]].getparent()
+
+    def push_text(self, text):
+        """Append ``text`` after the current content of the active cursor."""
+        cursor = self.cursors[self.current_cursors[-1]]
+        if len(cursor):
+            # The cursor already has children, so the text follows the
+            # last child: it goes to that child's tail.  The cursor's own
+            # tail would place it after the cursor's closing tag.
+            last_child = cursor[-1]
+            last_child.tail = (last_child.tail or '') + text
+        else:
+            cursor.text = (cursor.text or '') + text
diff --git a/src/librarian/builders/sanitize.py b/src/librarian/builders/sanitize.py new file mode 100644 index
0000000..4d7f7f9 --- /dev/null +++ b/src/librarian/builders/sanitize.py @@ -0,0 +1,18 @@ +from lxml import etree +from librarian import OutputFile + + +class Sanitizer: + identifier = 'sanitize' + file_extension = 'xml2' + + def build(self, document): + doc = document.tree.getroot() # TODO: copy + doc.sanitize() + return OutputFile.from_bytes( + etree.tostring( + doc, + encoding='utf-8', + ) + ) + diff --git a/src/librarian/builders/txt.py b/src/librarian/builders/txt.py new file mode 100644 index 0000000..4023814 --- /dev/null +++ b/src/librarian/builders/txt.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import io +from librarian import OutputFile, get_resource + + +with io.open(get_resource("res/text/template.txt")) as f: + TEMPLATE = f.read() + + +class TxtFragment: + def __init__(self): + self.pieces = [] + self.current_margin = 0 + self.starting_block = True + + def push_legacy_margin(self, margin): + if margin: + if self.pieces: + self.pieces[-1] = self.pieces[-1].rstrip(' ') + self.pieces.append('\r\n' * margin) + self.current_margin += margin + self.starting_block = True + + def push_margin(self, margin): + if margin: + if self.pieces: + self.pieces[-1] = self.pieces[-1].rstrip(' ') + if margin > self.current_margin: + self.pieces.append('\r\n' * (margin - self.current_margin)) + self.current_margin = margin + self.starting_block = True + + def push_text(self, text, prepared=False): + if text: + if self.starting_block and not prepared: + text = text.lstrip() + self.pieces.append(text) + self.current_margin = 0 + if not prepared: + self.starting_block = False + + +class TxtBuilder: + """ + """ + file_extension = "txt" + identifier = "txt" + + default_license_description = { + "pol": ( + "Ten utwór nie jest objęty majątkowym prawem autorskim " + "i znajduje się w domenie publicznej, co oznacza że " + "możesz go swobodnie wykorzystywać, publikować " + "i rozpowszechniać. 
Jeśli utwór opatrzony jest " + "dodatkowymi materiałami (przypisy, motywy literackie " + "etc.), które podlegają prawu autorskiemu, to te " + "dodatkowe materiały udostępnione są na licencji " + "Creative Commons Uznanie Autorstwa – Na Tych Samych " + "Warunkach 3.0 PL " + "(http://creativecommons.org/licenses/by-sa/3.0/)" + ) + } + license_description = { + "pol": "Ten utwór jest udostępniony na licencji {meta.license_description}: \n{meta.license}", + } + + def __init__(self): + self.fragments = { + None: TxtFragment(), + 'header': TxtFragment() + } + self.current_fragments = [self.fragments[None]] + + def enter_fragment(self, fragment): + self.current_fragments.append(self.fragments[fragment]) + + def exit_fragment(self): + self.current_fragments.pop() + + def push_text(self, text, prepared=False): + self.current_fragments[-1].push_text(text, prepared=prepared) + + def push_margin(self, margin): + self.current_fragments[-1].push_margin(margin) + + def push_legacy_margin(self, margin, where=None): + self.current_fragments[-1].push_legacy_margin(margin) + + def build(self, document, raw_text=False): + document.tree.getroot().txt_build(self) + meta = document.meta + + self.enter_fragment('header') + if meta.translators: + self.push_text("tłum. 
", 'header') + for translator in meta.translators: + self.push_text(translator.readable()) + #builder.push_margin(2) + self.push_legacy_margin(1) + + if meta.isbn_txt: + #builder.push_margin(2) + self.push_legacy_margin(1) + isbn = meta.isbn_txt + if isbn.startswith(('ISBN-' , 'ISBN ')): + isbn = isbn[5:] + self.push_text('ISBN {isbn}'.format(isbn=isbn)) + #builder.push_margin(5) + + #builder.push_margin(4) + self.push_legacy_margin(1) + self.exit_fragment() + + text = ''.join(self.fragments['header'].pieces) + ''.join(self.fragments[None].pieces) + + if raw_text: + result = text + else: + if meta.license: + license_description = self.license_description['pol'].format(meta=meta) + else: + license_description = self.default_license_description['pol'] + + if meta.source_name: + source = "\n\nTekst opracowany na podstawie: " + meta.source_name + else: + source = '' + + contributors = ', '.join( + person.readable() + for person in sorted(set( + p for p in ( + meta.technical_editors + meta.editors + ) if p)) + ) + if contributors: + contributors = ( + "\n\nOpracowanie redakcyjne i przypisy: %s." + % contributors + ) + + funders = ', '.join(meta.funders) + if funders: + funders = u"\n\nPublikację wsparli i wsparły: %s." 
% funders + + isbn = getattr(meta, 'isbn_txt', None) + if isbn: + isbn = '\n\n' + isbn + else: + isbn = '' + + result = TEMPLATE % { + "text": text, + "description": meta.description, + "url": meta.url, + "license_description": license_description, + "source": source, + "contributors": contributors, + "funders": funders, + "publisher": '\n\nWydawca: ' + ', '.join(meta.publisher), + "isbn": isbn, + } + + result = '\r\n'.join(result.splitlines()) + '\r\n' + return OutputFile.from_bytes(result.encode('utf-8'))
diff --git a/src/librarian/command_line.py b/src/librarian/command_line.py
new file mode 100644
index 0000000..91196f1
--- /dev/null
+++ b/src/librarian/command_line.py
@@ -0,0 +1,54 @@
+import argparse
+import os.path
+from .builders import builders
+from .document import WLDocument
+
+
+def main(*args, **kwargs):
+    """Console entry point: build one output format from an input file.
+
+    Command-line arguments are taken from ``sys.argv``; the arguments
+    passed to this function are not used.
+    """
+    parser = argparse.ArgumentParser(description="PARSER DESCRIPTION")
+
+    parser.add_argument(
+        'builder',
+        choices=[b.identifier for b in builders],
+        help="Builder"
+    )
+    parser.add_argument('input_file')
+    parser.add_argument(
+        '-o', '--output-file', metavar='FILE',
+        help='specifies the output file'
+    )
+    parser.add_argument(
+        '-O', '--output-dir', metavar='DIR',
+        help='specifies the directory for output'
+    )
+
+    args = parser.parse_args()
+
+    if args.output_file:
+        output_file_path = args.output_file
+    else:
+        # Look up the builder class to learn its file extension;
+        # `choices` above guarantees that the identifier is known.
+        builder = next(
+            b for b in builders if b.identifier == args.builder
+        )
+        output_file_path = '.'.join((
+            os.path.splitext(args.input_file)[0],
+            builder.file_extension
+        ))
+        if args.output_dir:
+            # Keep only the file name and place it in the directory.
+            output_file_path = os.path.join(
+                args.output_dir,
+                os.path.basename(output_file_path)
+            )
+
+    document = WLDocument(filename=args.input_file)
+    output = document.build(args.builder)
+    with open(output_file_path, 'wb') as f:
+        f.write(output.get_bytes())
diff --git a/src/librarian/dcparser.py b/src/librarian/dcparser.py index 432b580..2072695 100644 --- a/src/librarian/dcparser.py +++ b/src/librarian/dcparser.py @@ -212,7 +212,7 @@ class Field(object): % (self.uri, e.message) ) - def validate(self, fdict,
fallbacks=None, strict=False): + def validate(self, fdict, fallbacks=None, strict=False, validate_required=True): if fallbacks is None: fallbacks = {} if self.uri not in fdict: @@ -227,8 +227,10 @@ class Field(object): f = [fallbacks[self.salias]] else: f = self.default - else: + elif validate_required: raise ValidationError("Required field %s not found" % self.uri) + else: + return None else: f = fdict[self.uri] @@ -363,7 +365,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): return cls(desc.attrib, field_dict, *args, **kwargs) - def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False): + def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False, validate_required=True): """ rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description. @@ -376,7 +378,7 @@ class WorkInfo(six.with_metaclass(DCInfo, object)): for field in self.FIELDS: value = field.validate(dc_fields, fallbacks=fallbacks, - strict=strict) + strict=strict, validate_required=validate_required) setattr(self, 'prop_' + field.name, value) self.fmap[field.name] = field if field.salias:
diff --git a/src/librarian/document.py b/src/librarian/document.py
new file mode 100644
index 0000000..1bd249d
--- /dev/null
+++ b/src/librarian/document.py
@@ -0,0 +1,38 @@
+from lxml import etree
+from .builders import get_builder_class
+from .parser import parser
+from . import dcparser
+
+
+class WLDocument:
+    """A parsed document: an element tree plus fallback metadata."""
+
+    def __init__(self, tree=None, filename=None):
+        """Wrap ``tree``, or parse ``filename`` when it is given.
+
+        Raises ValueError when neither argument is provided.
+        """
+        if filename is not None:
+            tree = etree.parse(filename, parser=parser)
+        if tree is None:
+            raise ValueError("Either tree or filename is required.")
+        self.tree = tree
+        # Back-reference read by the root element's ``meta`` fallback.
+        tree.getroot().document = self
+        # Empty metadata, used when no element provides its own.
+        self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
+
+    @property
+    def meta(self):
+        """Metadata of the root element."""
+        # TODO: allow metadata of the master element as document meta
+        # (``self.tree.getroot()[-1]``).
+        return self.tree.getroot().meta
+
+    def build(self, builder_id, **kwargs):
+        """Build the document with the builder registered as ``builder_id``.
+
+        Extra keyword arguments are passed to the builder's ``build()``.
+        """
+        return get_builder_class(builder_id)().build(self, **kwargs)
+
diff --git a/src/librarian/elements/__init__.py b/src/librarian/elements/__init__.py new file mode 100644 index 0000000..512b4ef --- /dev/null +++ b/src/librarian/elements/__init__.py @@ -0,0 +1,115 @@ +from lxml import etree +from . import (blocks, comments, drama, figures, footnotes, front, headers, + masters, paragraphs, poetry, root, separators, styles, themes) + + +WL_ELEMENTS = { + 'meta': etree.ElementBase, + 'coverClass': etree.ElementBase, + "developmentStage": etree.ElementBase, + "coverBarColor": etree.ElementBase, + "coverBoxPosition": etree.ElementBase, + "coverLogoUrl": etree.ElementBase, + + "utwor": root.Utwor, + "dramat_wierszowany_l": masters.Master, + "dramat_wierszowany_lp": masters.Master, + "dramat_wspolczesny": masters.Master, + "liryka_l": masters.Master, + "liryka_lp": masters.Master, + "opowiadanie": masters.Master, + "powiesc": masters.Master, + + "autor_utworu": front.AutorUtworu, + "dzielo_nadrzedne": front.DzieloNadrzedne, + "nazwa_utworu": front.NazwaUtworu, + "podtytul": front.Podtytul, + + "lista_osob": drama.ListaOsob, + "lista_osoba": drama.ListaOsoba, + "naglowek_osoba": drama.NaglowekOsoba, + "osoba": drama.Osoba, + + "dlugi_cytat": blocks.DlugiCytat, + "poezja_cyt": blocks.PoezjaCyt, + "dlugi_cyt": blocks.DlugiCytat, ### ???
+ + "slowo_obce": styles.SlowoObce, + "tytul_dziela": styles.TytulDziela, + "wyroznienie": styles.Wyroznienie, + + "akap": paragraphs.Akap, + "akap_cd": paragraphs.Akap, + "akap_dialog": paragraphs.Akap, + + "motto_podpis": front.MottoPodpis, + + "strofa": poetry.Strofa, + + "motto": front.Motto, + + "didaskalia": drama.Didaskalia, + "kwestia": drama.Kwestia, + "didask_tekst": drama.DidaskTekst, + + "dedykacja": paragraphs.Akap, + "miejsce_czas": paragraphs.Akap, + + "uwaga": comments.Uwaga, + + "wers": poetry.Wers, + "wers_wciety": poetry.WersWciety, + "wers_cd": poetry.WersCd, + "wers_akap": poetry.Wers, + "zastepnik_wersu": poetry.ZastepnikWersu, + "wers_do_prawej": poetry.Wers, + + "pa": footnotes.Footnote, + "pe": footnotes.Footnote, + "pr": footnotes.Footnote, + "pt": footnotes.Footnote, + + "begin": themes.Begin, + "end": themes.End, + "motyw": themes.Motyw, + + "nota": blocks.Nota, + + "nota_red": comments.Abstrakt, + "extra": comments.Abstrakt, + "abstrakt": comments.Abstrakt, + + "naglowek_czesc": headers.NaglowekCzesc, + "naglowek_akt": headers.NaglowekCzesc, + "naglowek_scena": headers.NaglowekRozdzial, + "naglowek_rozdzial": headers.NaglowekRozdzial, + "naglowek_podrozdzial": headers.NaglowekPodrozdzial, + "srodtytul": headers.NaglowekCzesc, + + "naglowek_listy": drama.NaglowekListy, + + "sekcja_asterysk": separators.SekcjaAsterysk, + "sekcja_swiatlo": separators.SekcjaSwiatlo, + "separator_linia": separators.SeparatorLinia, + + "wieksze_odstepy": styles.Wyroznienie, + "mat": styles.Wyroznienie, + "www": styles.Wyroznienie, + "indeks_dolny": styles.Wyroznienie, + + "tabela": paragraphs.Akap, + "tabelka": paragraphs.Akap, + "wiersz": paragraphs.Akap, + "kol": paragraphs.Akap, + + "ilustr": figures.Ilustr, + +# sklodowska-badanie-cial-radioaktywnych.xml + "mrow": paragraphs.Akap, + "mi": paragraphs.Akap, + "mo": paragraphs.Akap, + "msup": paragraphs.Akap, + "mn": paragraphs.Akap, + "mfrac": paragraphs.Akap, + "mfenced": paragraphs.Akap, +} diff --git 
a/src/librarian/elements/base.py b/src/librarian/elements/base.py new file mode 100644 index 0000000..fb5e3b1 --- /dev/null +++ b/src/librarian/elements/base.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 + +import re +from lxml import etree +from librarian import dcparser, RDFNS + + +class WLElement(etree.ElementBase): + TXT_TOP_MARGIN = 0 + TXT_BOTTOM_MARGIN = 0 + TXT_PREFIX = "" + TXT_SUFFIX = "" + + HTML_TAG = None + HTML_ATTR = {} + HTML_CLASS = None + HTML_SECTION = False + + CAN_HAVE_TEXT = True + STRIP = False + + text_substitutions = [ + (u'---', u'—'), + (u'--', u'–'), + (u'...', u'…'), + (u',,', u'„'), + (u'"', u'”'), + ('\ufeff', ''), + ] + + @property + def meta_object(self): + if not hasattr(self, '_meta_object'): + elem = self.find(RDFNS('RDF')) + if elem is not None: + self._meta_object = dcparser.BookInfo.from_element(elem) + else: + self._meta_object = None + return self._meta_object + + @property + def meta(self): + if self.meta_object is not None: + return self.meta_object + else: + if self.getparent() is not None: + return self.getparent().meta + else: + return self.document.base_meta + + def normalize_text(self, text): + text = text or '' + for e, s in self.text_substitutions: + text = text.replace(e, s) + text = re.sub(r'\s+', ' ', text) + return text + + def _build_inner(self, builder, build_method): + child_count = len(self) + if self.CAN_HAVE_TEXT and self.text: + text = self.normalize_text(self.text) + if self.STRIP: + text = text.lstrip() + if not child_count: + text = text.rstrip() + builder.push_text(text) + for i, child in enumerate(self): + if isinstance(child, WLElement): + getattr(child, build_method)(builder) + if self.CAN_HAVE_TEXT and child.tail: + text = self.normalize_text(child.tail) + if self.STRIP and i == child_count - 1: + text = text.rstrip() + builder.push_text(text) + + def _txt_build_inner(self, builder): + self._build_inner(builder, 'txt_build') + + def txt_build(self, builder): + if hasattr(self, 
'TXT_LEGACY_TOP_MARGIN'): + builder.push_legacy_margin(self.TXT_LEGACY_TOP_MARGIN) + else: + builder.push_margin(self.TXT_TOP_MARGIN) + builder.push_text(self.TXT_PREFIX, True) + self._txt_build_inner(builder) + builder.push_text(self.TXT_SUFFIX, True) + if hasattr(self, 'TXT_LEGACY_BOTTOM_MARGIN'): + builder.push_legacy_margin(self.TXT_LEGACY_BOTTOM_MARGIN) + else: + builder.push_margin(self.TXT_BOTTOM_MARGIN) + + def _html_build_inner(self, builder): + self._build_inner(builder, 'html_build') + + def get_html_attr(self, builder): + attr = self.HTML_ATTR.copy() + if self.HTML_CLASS: + attr['class'] = self.HTML_CLASS + # always copy the id attribute (?) + if self.attrib.get('id'): + attr['id'] = self.attrib['id'] + return attr + + def html_build(self, builder): + if self.HTML_SECTION: + builder.start_element( + 'a', {"name": "f18", "class": "target"} + ) + builder.push_text(" ") + builder.end_element() + + builder.start_element( + "a", {"href": "#f18", "class": "anchor"} + ) + builder.push_text("18") + builder.end_element() + + + if self.HTML_TAG: + builder.start_element( + self.HTML_TAG, + self.get_html_attr(builder), + ) + + if self.HTML_SECTION: + builder.start_element( + "a", {"name": "sec34"} + ) + builder.end_element() + + self._html_build_inner(builder) + if self.HTML_TAG: + builder.end_element() + + def sanitize(self): + # TODO: Remove insanity here. 
+ for e in self: + if isinstance(e, WLElement): + e.sanitize() diff --git a/src/librarian/elements/blocks/__init__.py b/src/librarian/elements/blocks/__init__.py new file mode 100644 index 0000000..075493a --- /dev/null +++ b/src/librarian/elements/blocks/__init__.py @@ -0,0 +1,3 @@ +from .dlugi_cytat import DlugiCytat +from .nota import Nota +from .poezja_cyt import PoezjaCyt diff --git a/src/librarian/elements/blocks/dlugi_cytat.py b/src/librarian/elements/blocks/dlugi_cytat.py new file mode 100644 index 0000000..8137867 --- /dev/null +++ b/src/librarian/elements/blocks/dlugi_cytat.py @@ -0,0 +1,10 @@ +from ..base import WLElement + + +class DlugiCytat(WLElement): + CAN_HAVE_TEXT = False + + TXT_TOP_MARGIN = 3 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 0 diff --git a/src/librarian/elements/blocks/nota.py b/src/librarian/elements/blocks/nota.py new file mode 100644 index 0000000..c38021d --- /dev/null +++ b/src/librarian/elements/blocks/nota.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class Nota(WLElement): + CAN_HAVE_TEXT = False diff --git a/src/librarian/elements/blocks/poezja_cyt.py b/src/librarian/elements/blocks/poezja_cyt.py new file mode 100644 index 0000000..3349567 --- /dev/null +++ b/src/librarian/elements/blocks/poezja_cyt.py @@ -0,0 +1,10 @@ +from ..base import WLElement + + +class PoezjaCyt(WLElement): + CAN_HAVE_TEXT = False + + TXT_TOP_MARGIN = 3 + TXT_BOTTOM_MARGIN = 3 + TXT_LEGACY_TOP_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 0 diff --git a/src/librarian/elements/comments/__init__.py b/src/librarian/elements/comments/__init__.py new file mode 100644 index 0000000..9073a91 --- /dev/null +++ b/src/librarian/elements/comments/__init__.py @@ -0,0 +1,2 @@ +from .abstrakt import Abstrakt +from .uwaga import Uwaga diff --git a/src/librarian/elements/comments/abstrakt.py b/src/librarian/elements/comments/abstrakt.py new file mode 100644 index 0000000..9b43dc3 --- /dev/null +++ 
b/src/librarian/elements/comments/abstrakt.py @@ -0,0 +1,9 @@ +from ..base import WLElement + + +class Abstrakt(WLElement): + def txt_build(self, builder): + pass + + def html_build(self, builder): + pass diff --git a/src/librarian/elements/comments/uwaga.py b/src/librarian/elements/comments/uwaga.py new file mode 100644 index 0000000..5a5e26c --- /dev/null +++ b/src/librarian/elements/comments/uwaga.py @@ -0,0 +1,7 @@ +from ..base import WLElement + + +class Uwaga(WLElement): + def txt_build(self, builder): + pass + diff --git a/src/librarian/elements/drama/__init__.py b/src/librarian/elements/drama/__init__.py new file mode 100644 index 0000000..1c88a6a --- /dev/null +++ b/src/librarian/elements/drama/__init__.py @@ -0,0 +1,8 @@ +from .didaskalia import Didaskalia +from .didask_tekst import DidaskTekst +from .kwestia import Kwestia +from .lista_osoba import ListaOsoba +from .lista_osob import ListaOsob +from .naglowek_listy import NaglowekListy +from .naglowek_osoba import NaglowekOsoba +from .osoba import Osoba diff --git a/src/librarian/elements/drama/didask_tekst.py b/src/librarian/elements/drama/didask_tekst.py new file mode 100644 index 0000000..7b6ae54 --- /dev/null +++ b/src/librarian/elements/drama/didask_tekst.py @@ -0,0 +1,6 @@ +from ..base import WLElement + + +class DidaskTekst(WLElement): + TXT_PREFIX = "/ " + TXT_SUFFIX = " /"
diff --git a/src/librarian/elements/drama/didaskalia.py b/src/librarian/elements/drama/didaskalia.py
new file mode 100644
index 0000000..cdd7900
--- /dev/null
+++ b/src/librarian/elements/drama/didaskalia.py
@@ -0,0 +1,12 @@
+from ..base import WLElement
+
+
+class Didaskalia(WLElement):
+    """Element rendered in text output between "/ " and " /"."""
+    TXT_TOP_MARGIN = 2
+    TXT_BOTTOM_MARGIN = 2
+    TXT_LEGACY_TOP_MARGIN = 2
+    TXT_LEGACY_BOTTOM_MARGIN = 0
+    TXT_PREFIX = "/ "
+    TXT_SUFFIX = " /"
+
diff --git a/src/librarian/elements/drama/kwestia.py b/src/librarian/elements/drama/kwestia.py new file mode 100644 index 0000000..0bb5f3d --- /dev/null +++ b/src/librarian/elements/drama/kwestia.py
@@ -0,0 +1,6 @@ +from ..base import WLElement + + +class Kwestia(WLElement): + CAN_HAVE_TEXT = False + diff --git a/src/librarian/elements/drama/lista_osob.py b/src/librarian/elements/drama/lista_osob.py new file mode 100644 index 0000000..ec18472 --- /dev/null +++ b/src/librarian/elements/drama/lista_osob.py @@ -0,0 +1,11 @@ +from ..base import WLElement + + +class ListaOsob(WLElement): + CAN_HAVE_TEXT = False + + TXT_TOP_MARGIN = 3 + TXT_BOTTOM_MARGIN = 3 + TXT_LEGACY_TOP_MARGIN = 3 + TXT_LEGACY_BOTTOM_MARGIN = 1 + diff --git a/src/librarian/elements/drama/lista_osoba.py b/src/librarian/elements/drama/lista_osoba.py new file mode 100644 index 0000000..5759c50 --- /dev/null +++ b/src/librarian/elements/drama/lista_osoba.py @@ -0,0 +1,10 @@ +from ..base import WLElement + + +class ListaOsoba(WLElement): + TXT_TOP_MARGIN = 1 + TXT_BOTTOM_MARGIN = 1 + TXT_LEGACY_TOP_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 0 + TXT_PREFIX = " * " + diff --git a/src/librarian/elements/drama/naglowek_listy.py b/src/librarian/elements/drama/naglowek_listy.py new file mode 100644 index 0000000..398a055 --- /dev/null +++ b/src/librarian/elements/drama/naglowek_listy.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class NaglowekListy(WLElement): + pass diff --git a/src/librarian/elements/drama/naglowek_osoba.py b/src/librarian/elements/drama/naglowek_osoba.py new file mode 100644 index 0000000..076936b --- /dev/null +++ b/src/librarian/elements/drama/naglowek_osoba.py @@ -0,0 +1,9 @@ +from ..base import WLElement + + +class NaglowekOsoba(WLElement): + TXT_TOP_MARGIN = 3 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 3 + TXT_LEGACY_BOTTOM_MARGIN = 0 + diff --git a/src/librarian/elements/drama/osoba.py b/src/librarian/elements/drama/osoba.py new file mode 100644 index 0000000..fea8d60 --- /dev/null +++ b/src/librarian/elements/drama/osoba.py @@ -0,0 +1,6 @@ +from ..base import WLElement + + +class Osoba(WLElement): + pass + diff --git a/src/librarian/elements/figures/__init__.py 
b/src/librarian/elements/figures/__init__.py new file mode 100644 index 0000000..a0c464a --- /dev/null +++ b/src/librarian/elements/figures/__init__.py @@ -0,0 +1 @@ +from .ilustr import Ilustr diff --git a/src/librarian/elements/figures/ilustr.py b/src/librarian/elements/figures/ilustr.py new file mode 100644 index 0000000..bd51453 --- /dev/null +++ b/src/librarian/elements/figures/ilustr.py @@ -0,0 +1,10 @@ +from ..base import WLElement + + +class Ilustr(WLElement): + HTML_TAG = 'img' + + def get_html_attr(self, builder): + return { + 'src': builder.image_location + self.attrib['src'] + } diff --git a/src/librarian/elements/footnotes/__init__.py b/src/librarian/elements/footnotes/__init__.py new file mode 100644 index 0000000..eefe9db --- /dev/null +++ b/src/librarian/elements/footnotes/__init__.py @@ -0,0 +1,7 @@ +from ..base import WLElement + + +class Footnote(WLElement): + def txt_build(self, builder): + pass + diff --git a/src/librarian/elements/front/__init__.py b/src/librarian/elements/front/__init__.py new file mode 100644 index 0000000..44e7a99 --- /dev/null +++ b/src/librarian/elements/front/__init__.py @@ -0,0 +1,6 @@ +from .autor_utworu import AutorUtworu +from .dzielo_nadrzedne import DzieloNadrzedne +from .motto_podpis import MottoPodpis +from .motto import Motto +from .nazwa_utworu import NazwaUtworu +from .podtytul import Podtytul diff --git a/src/librarian/elements/front/autor_utworu.py b/src/librarian/elements/front/autor_utworu.py new file mode 100644 index 0000000..fd6b2e8 --- /dev/null +++ b/src/librarian/elements/front/autor_utworu.py @@ -0,0 +1,8 @@ +from .base import HeaderElement + + +class AutorUtworu(HeaderElement): + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_BOTTOM_MARGIN = 2 + + HTML_CLASS = 'author' diff --git a/src/librarian/elements/front/base.py b/src/librarian/elements/front/base.py new file mode 100644 index 0000000..9e961df --- /dev/null +++ b/src/librarian/elements/front/base.py @@ -0,0 +1,15 @@ +from ..base import WLElement + + 
+class HeaderElement(WLElement): + HTML_TAG = 'span' + + def txt_build(self, builder): + builder.enter_fragment('header') + super(HeaderElement, self).txt_build(builder) + builder.exit_fragment() + + def html_build(self, builder): + builder.enter_fragment('header') + super(HeaderElement, self).html_build(builder) + builder.exit_fragment() diff --git a/src/librarian/elements/front/dzielo_nadrzedne.py b/src/librarian/elements/front/dzielo_nadrzedne.py new file mode 100644 index 0000000..c53b3ad --- /dev/null +++ b/src/librarian/elements/front/dzielo_nadrzedne.py @@ -0,0 +1,6 @@ +from .base import HeaderElement + + +class DzieloNadrzedne(HeaderElement): + TXT_BOTTOM_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 1 diff --git a/src/librarian/elements/front/motto.py b/src/librarian/elements/front/motto.py new file mode 100644 index 0000000..fd81220 --- /dev/null +++ b/src/librarian/elements/front/motto.py @@ -0,0 +1,6 @@ +from ..base import WLElement + + +class Motto(WLElement): + TXT_LEGACY_TOP_MARGIN = 4 + TXT_LEGACY_BOTTOM_MARGIN = 2 diff --git a/src/librarian/elements/front/motto_podpis.py b/src/librarian/elements/front/motto_podpis.py new file mode 100644 index 0000000..decbff3 --- /dev/null +++ b/src/librarian/elements/front/motto_podpis.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class MottoPodpis(WLElement): + pass diff --git a/src/librarian/elements/front/nazwa_utworu.py b/src/librarian/elements/front/nazwa_utworu.py new file mode 100644 index 0000000..55ef78b --- /dev/null +++ b/src/librarian/elements/front/nazwa_utworu.py @@ -0,0 +1,9 @@ +from .base import HeaderElement + + +class NazwaUtworu(HeaderElement): + TXT_BOTTOM_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 1 + + HTML_TAG = 'span' + HTML_CLASS = 'title' diff --git a/src/librarian/elements/front/podtytul.py b/src/librarian/elements/front/podtytul.py new file mode 100644 index 0000000..4431bc2 --- /dev/null +++ b/src/librarian/elements/front/podtytul.py @@ -0,0 +1,8 @@ +from .base import HeaderElement + + 
+class Podtytul(HeaderElement): + TXT_BOTTOM_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 1 + + HTML_CLASS = 'subtitle' diff --git a/src/librarian/elements/headers/__init__.py b/src/librarian/elements/headers/__init__.py new file mode 100644 index 0000000..9ddf8d5 --- /dev/null +++ b/src/librarian/elements/headers/__init__.py @@ -0,0 +1,3 @@ +from .naglowek_czesc import NaglowekCzesc +from .naglowek_podrozdzial import NaglowekPodrozdzial +from .naglowek_rozdzial import NaglowekRozdzial diff --git a/src/librarian/elements/headers/naglowek_czesc.py b/src/librarian/elements/headers/naglowek_czesc.py new file mode 100644 index 0000000..7b0781a --- /dev/null +++ b/src/librarian/elements/headers/naglowek_czesc.py @@ -0,0 +1,8 @@ +from ..base import WLElement + + +class NaglowekCzesc(WLElement): + TXT_TOP_MARGIN = 5 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 5 + TXT_LEGACY_BOTTOM_MARGIN = 0 diff --git a/src/librarian/elements/headers/naglowek_podrozdzial.py b/src/librarian/elements/headers/naglowek_podrozdzial.py new file mode 100644 index 0000000..6d3f85f --- /dev/null +++ b/src/librarian/elements/headers/naglowek_podrozdzial.py @@ -0,0 +1,8 @@ +from ..base import WLElement + + +class NaglowekPodrozdzial(WLElement): + TXT_TOP_MARGIN = 3 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 3 + TXT_LEGACY_BOTTOM_MARGIN = 0 diff --git a/src/librarian/elements/headers/naglowek_rozdzial.py b/src/librarian/elements/headers/naglowek_rozdzial.py new file mode 100644 index 0000000..ded615f --- /dev/null +++ b/src/librarian/elements/headers/naglowek_rozdzial.py @@ -0,0 +1,10 @@ +from ..base import WLElement + + +class NaglowekRozdzial(WLElement): + TXT_TOP_MARGIN = 4 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 4 + TXT_LEGACY_BOTTOM_MARGIN = 0 + + HTML_TAG = 'h3' diff --git a/src/librarian/elements/masters/__init__.py b/src/librarian/elements/masters/__init__.py new file mode 100644 index 0000000..2fdb6a6 --- /dev/null +++ b/src/librarian/elements/masters/__init__.py @@ -0,0 
+1,7 @@ +from ..base import WLElement + + +class Master(WLElement): + CAN_HAVE_TEXT = False + + TXT_LEGACY_BOTTOM_MARGIN = 2 diff --git a/src/librarian/elements/paragraphs/__init__.py b/src/librarian/elements/paragraphs/__init__.py new file mode 100644 index 0000000..d6c8438 --- /dev/null +++ b/src/librarian/elements/paragraphs/__init__.py @@ -0,0 +1 @@ +from .akap import Akap diff --git a/src/librarian/elements/paragraphs/akap.py b/src/librarian/elements/paragraphs/akap.py new file mode 100644 index 0000000..836671b --- /dev/null +++ b/src/librarian/elements/paragraphs/akap.py @@ -0,0 +1,15 @@ +from ..base import WLElement + + +class Akap(WLElement): + STRIP = True + + TXT_TOP_MARGIN = 2 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 2 + TXT_LEGACY_BOTTOM_MARGIN = 0 + + HTML_TAG = 'p' + HTML_CLASS = 'paragraph' + + HTML_SECTION = True diff --git a/src/librarian/elements/poetry/__init__.py b/src/librarian/elements/poetry/__init__.py new file mode 100644 index 0000000..80fdc4f --- /dev/null +++ b/src/librarian/elements/poetry/__init__.py @@ -0,0 +1,5 @@ +from .strofa import Strofa +from .wers_cd import WersCd +from .wers import Wers +from .wers_wciety import WersWciety +from .zastepnik_wersu import ZastepnikWersu diff --git a/src/librarian/elements/poetry/strofa.py b/src/librarian/elements/poetry/strofa.py new file mode 100644 index 0000000..2d3a4c9 --- /dev/null +++ b/src/librarian/elements/poetry/strofa.py @@ -0,0 +1,50 @@ +from copy import copy +from ..base import WLElement +from .wers import Wers + + +class Strofa(WLElement): + TXT_TOP_MARGIN = 2 + TXT_BOTTOM_MARGIN = 2 + TXT_LEGACY_TOP_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 0 + + def get_verses(self): + from librarian.parser import parser + + verses = [ + parser.makeelement('wers') + ] + if self.text: + # Before any tags. These are text-only verses. 
+ pieces = self.text.split('/') + for piece in pieces[:-1]: + verses[-1].text = piece + verses.append(parser.makeelement('wers')) + verses[-1].text = pieces[-1] + + for child in self: + if child.tail: + pieces = child.tail.split('/') + child_copy = copy(child) + child_copy.tail = pieces[0] + verses[-1].append(child_copy) + + for piece in pieces[1:]: + verses.append(parser.makeelement('wers')) + verses[-1].text = piece + + else: + verses[-1].append(child) + + for verse in verses: + if len(verse) == 1 and isinstance(verse[0], Wers): + assert not (verse.text or '').strip() + assert not (verse[0].tail or '').strip() + yield verse[0] + else: + yield verse + + def _build_inner(self, builder, build_method): + for child in self.get_verses(): + getattr(child, build_method)(builder) diff --git a/src/librarian/elements/poetry/wers.py b/src/librarian/elements/poetry/wers.py new file mode 100644 index 0000000..e164b1d --- /dev/null +++ b/src/librarian/elements/poetry/wers.py @@ -0,0 +1,13 @@ +from ..base import WLElement + + +class Wers(WLElement): + STRIP = True + + TXT_TOP_MARGIN = 1 + TXT_BOTTOM_MARGIN = 1 + TXT_LEGACY_TOP_MARGIN = 1 + TXT_LEGACY_BOTTOM_MARGIN = 0 + + HTML_TAG = 'div' + HTML_ATTRIB = {"class": "verse"} diff --git a/src/librarian/elements/poetry/wers_cd.py b/src/librarian/elements/poetry/wers_cd.py new file mode 100644 index 0000000..7a14938 --- /dev/null +++ b/src/librarian/elements/poetry/wers_cd.py @@ -0,0 +1,6 @@ +from .wers import Wers + +class WersCd(Wers): + def _txt_build_inner(self, builder): + builder.push_text(' ' * 24, prepared=True) + super(WersCd, self)._txt_build_inner(builder) diff --git a/src/librarian/elements/poetry/wers_wciety.py b/src/librarian/elements/poetry/wers_wciety.py new file mode 100644 index 0000000..3e9bb6f --- /dev/null +++ b/src/librarian/elements/poetry/wers_wciety.py @@ -0,0 +1,16 @@ +from .wers import Wers + + +class WersWciety(Wers): + @property + def typ(self): + ## Temporary legacy compatibility fix. 
+ return 2 if 'typ' in self.attrib else 1 + + v = self.attrib.get('typ') + return int(v) if v else 1 + + def _txt_build_inner(self, builder): + builder.push_text(' ' * self.typ, prepared=True) + super(WersWciety, self)._txt_build_inner(builder) + diff --git a/src/librarian/elements/poetry/zastepnik_wersu.py b/src/librarian/elements/poetry/zastepnik_wersu.py new file mode 100644 index 0000000..edee0f7 --- /dev/null +++ b/src/librarian/elements/poetry/zastepnik_wersu.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class ZastepnikWersu(WLElement): + pass diff --git a/src/librarian/elements/root/__init__.py b/src/librarian/elements/root/__init__.py new file mode 100644 index 0000000..a8cf82d --- /dev/null +++ b/src/librarian/elements/root/__init__.py @@ -0,0 +1,19 @@ +from ..base import WLElement +from ..masters import Master + + +class Utwor(WLElement): + CAN_HAVE_TEXT = False + + @property + def meta(self): + if self.meta_object is not None: + return self.meta_object + else: + # Deprecated: allow RDF record in master. + for c in self: + if isinstance(c, Master) and c.meta_object is not None: + return c.meta_object + # This should not generally happen. 
+ if self.getparent() is not None: + return self.getparent().meta diff --git a/src/librarian/elements/separators/__init__.py b/src/librarian/elements/separators/__init__.py new file mode 100644 index 0000000..84e9784 --- /dev/null +++ b/src/librarian/elements/separators/__init__.py @@ -0,0 +1,3 @@ +from .sekcja_asterysk import SekcjaAsterysk +from .sekcja_swiatlo import SekcjaSwiatlo +from .separator_linia import SeparatorLinia diff --git a/src/librarian/elements/separators/sekcja_asterysk.py b/src/librarian/elements/separators/sekcja_asterysk.py new file mode 100644 index 0000000..c11b9d0 --- /dev/null +++ b/src/librarian/elements/separators/sekcja_asterysk.py @@ -0,0 +1,11 @@ +from ..base import WLElement + + +class SekcjaAsterysk(WLElement): + TXT_TOP_MARGIN = 2 + TXT_BOTTOM_MARGIN = 4 + TXT_LEGACY_TOP_MARGIN = 2 + TXT_LEGACY_BOTTOM_MARGIN = 2 + + def _txt_build_inner(self, builder): + builder.push_text('*') diff --git a/src/librarian/elements/separators/sekcja_swiatlo.py b/src/librarian/elements/separators/sekcja_swiatlo.py new file mode 100644 index 0000000..1526548 --- /dev/null +++ b/src/librarian/elements/separators/sekcja_swiatlo.py @@ -0,0 +1,7 @@ +from ..base import WLElement + + +class SekcjaSwiatlo(WLElement): + TXT_BOTTOM_MARGIN = 6 + TXT_LEGACY_BOTTOM_MARGIN = 4 + diff --git a/src/librarian/elements/separators/separator_linia.py b/src/librarian/elements/separators/separator_linia.py new file mode 100644 index 0000000..7587785 --- /dev/null +++ b/src/librarian/elements/separators/separator_linia.py @@ -0,0 +1,11 @@ +from ..base import WLElement + + +class SeparatorLinia(WLElement): + TXT_TOP_MARGIN = 4 + TXT_BOTTOM_MARGIN = 4 + TXT_LEGACY_TOP_MARGIN = 2 + TXT_LEGACY_BOTTOM_MARGIN = 2 + + def _txt_build_inner(self, builder): + builder.push_text('-' * 48) diff --git a/src/librarian/elements/styles/__init__.py b/src/librarian/elements/styles/__init__.py new file mode 100644 index 0000000..40afc01 --- /dev/null +++ 
b/src/librarian/elements/styles/__init__.py @@ -0,0 +1,3 @@ +from .slowo_obce import SlowoObce +from .tytul_dziela import TytulDziela +from .wyroznienie import Wyroznienie diff --git a/src/librarian/elements/styles/slowo_obce.py b/src/librarian/elements/styles/slowo_obce.py new file mode 100644 index 0000000..537f7c6 --- /dev/null +++ b/src/librarian/elements/styles/slowo_obce.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class SlowoObce(WLElement): + pass diff --git a/src/librarian/elements/styles/tytul_dziela.py b/src/librarian/elements/styles/tytul_dziela.py new file mode 100644 index 0000000..b6c3662 --- /dev/null +++ b/src/librarian/elements/styles/tytul_dziela.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 +from ..base import WLElement + + +class TytulDziela(WLElement): + def normalize_text(self, text): + txt = super(TytulDziela, self).normalize_text(text) + if self.attrib.get('typ') == '1': + txt = '„{txt}”'.format(txt=txt) + return txt diff --git a/src/librarian/elements/styles/wyroznienie.py b/src/librarian/elements/styles/wyroznienie.py new file mode 100644 index 0000000..dce6936 --- /dev/null +++ b/src/librarian/elements/styles/wyroznienie.py @@ -0,0 +1,7 @@ +from ..base import WLElement + + +class Wyroznienie(WLElement): + TXT_PREFIX = "*" + TXT_SUFFIX = "*" + diff --git a/src/librarian/elements/themes/__init__.py b/src/librarian/elements/themes/__init__.py new file mode 100644 index 0000000..f2278e3 --- /dev/null +++ b/src/librarian/elements/themes/__init__.py @@ -0,0 +1,3 @@ +from .begin import Begin +from .end import End +from .motyw import Motyw diff --git a/src/librarian/elements/themes/begin.py b/src/librarian/elements/themes/begin.py new file mode 100644 index 0000000..2d7ddc4 --- /dev/null +++ b/src/librarian/elements/themes/begin.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class Begin(WLElement): + pass diff --git a/src/librarian/elements/themes/end.py b/src/librarian/elements/themes/end.py new file mode 100644 index 
0000000..a6eb9e0 --- /dev/null +++ b/src/librarian/elements/themes/end.py @@ -0,0 +1,5 @@ +from ..base import WLElement + + +class End(WLElement): + pass diff --git a/src/librarian/elements/themes/motyw.py b/src/librarian/elements/themes/motyw.py new file mode 100644 index 0000000..51042c6 --- /dev/null +++ b/src/librarian/elements/themes/motyw.py @@ -0,0 +1,14 @@ +from ..base import WLElement + + +class Motyw(WLElement): + def txt_build(self, builder): + pass + + + def feed_to(self, builder): + assert not len(self) + themes = [ + self.normalize_text(t.strip()) for t in self.text.split(',') + ] + builder.set_themes(self.attrib['id'], themes) diff --git a/src/librarian/parser.py b/src/librarian/parser.py index 2bb9509..3ae081b 100644 --- a/src/librarian/parser.py +++ b/src/librarian/parser.py @@ -19,7 +19,27 @@ import re import six +from .elements import WL_ELEMENTS + + +class WLElementLookup(etree.CustomElementClassLookup): + def lookup(self, node_type, document, namespace, name): + if node_type != 'element': + return + if namespace: + return + return WL_ELEMENTS[name] + + +parser = etree.XMLParser() +parser.set_element_class_lookup( + WLElementLookup() +) + + + class WLDocument(object): + """Legacy class, to be replaced with document.WLDocument.""" LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE) provider = None diff --git a/tests/test_text.py b/tests/test_text.py index 14c728f..bdd3ded 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,25 +6,36 @@ from __future__ import unicode_literals from librarian import NoDublinCore -from librarian.parser import WLDocument +from librarian.parser import WLDocument as LegacyWLDocument +from librarian.document import WLDocument from nose.tools import * from .utils import get_fixture -def test_transform(): +def test_transform_legacy(): expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt') - text = WLDocument.from_file( + text = LegacyWLDocument.from_file( get_fixture('text', 
'miedzy-nami-nic-nie-bylo.xml') ).as_text().get_bytes() assert_equal(text, open(expected_output_file_path, 'rb').read()) +def test_transform(): + expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected.txt') + + text = WLDocument( + filename=get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') + ).build('txt').get_bytes() + + assert_equal(text, open(expected_output_file_path, 'rb').read()) + + def test_transform_raw(): expected_output_file_path = get_fixture('text', 'asnyk_miedzy_nami_expected_raw.txt') - text = WLDocument.from_file( + text = LegacyWLDocument.from_file( get_fixture('text', 'miedzy-nami-nic-nie-bylo.xml') ).as_text(flags=['raw-text']).get_bytes() @@ -33,14 +44,14 @@ def test_transform_raw(): @raises(NoDublinCore) def test_no_dublincore(): - WLDocument.from_file( + LegacyWLDocument.from_file( get_fixture('text', 'asnyk_miedzy_nami_nodc.xml') ).as_text() def test_passing_parse_dublincore_to_transform(): """Passing parse_dublincore=False to the constructor omits DublinCore parsing.""" - WLDocument.from_file( + LegacyWLDocument.from_file( get_fixture('text', 'asnyk_miedzy_nami_nodc.xml'), parse_dublincore=False, ).as_text() diff --git a/tox.ini b/tox.ini index bdce7bd..48c35e3 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,7 @@ [tox] envlist = clean, - py{27,35,36,37}, + py{27,35,36,37,38}, stats [testenv] -- 2.20.1