+ fn- [] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..54c0b24 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,9 @@ +[nosetests] +detailed-errors=1 +with-coverage=1 +cover-package=librarian +cover-erase=1 +with-doctest=1 +exclude= + formats + tests,test_html_annotations diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py new file mode 100644 index 0000000..87e9b01 --- /dev/null +++ b/tests/test_html_annotations.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 +from __future__ import unicode_literals + +from StringIO import StringIO +import tempfile +from librarian.parser import WLDocument +from librarian.html import extract_annotations +from lxml import etree +from nose.tools import eq_ + + +def _test_annotation(expected, got, name): + assert got[0].startswith('anchor-'), "%s: Unexpected anchor: '%s', should begin with 'anchor-'" % (name, got[0]) + eq_(expected[0], got[1], "%s: Unexpected type, expected '%s', got '%s'" % (name, expected[0], got[1])) + eq_(expected[1], got[2], "%s: Unexpected qualifier, expected '%s', got '%s'" % (name, expected[1], got[2])) + eq_(expected[2], got[3], "%s: Unexpected text representation, expected '%s', got '%s'" % (name, expected[2], got[3])) + exp_html = '

' % (expected[0], expected[3]) + eq_(exp_html, got[4], "%s: Unexpected html representation, expected '%s', got '%s'" % (name, exp_html, got[4])) + + +def test_annotations(): + annotations = ( + + ('', ( + 'pe', + None, + '', + '

' + ), + 'Empty footnote'), + + ( + 'Definiendum --- definiens.', ( + 'pr', + None, + 'Definiendum \u2014 definiens.', + '

Definiendum \u2014 definiens.

' + ), + 'Plain footnote.'), + + ('Definiendum --- definiens.', ( + 'pt', + None, + 'Definiendum \u2014 definiens.', + '

Definiendum \u2014 definiens.

' + ), + 'Standard footnote.'), + + ('Definiendum (Åac.) --- definiens.', ( + 'pr', + 'Åac.', + 'Definiendum (Åac.) \u2014 definiens.', + '

Definiendum (Åac.) \u2014 definiens.

' + ), + 'Plain footnote with qualifier'), + + ('Definiendum (Åac.) --- definiens.', ( + 'pe', + 'Åac.', + 'Definiendum (Åac.) \u2014 definiens.', + '

Definiendum (Åac.) \u2014 definiens.

' + ), + 'Standard footnote with qualifier.'), + + (' Definiendum (daw.) --- definiens.', ( + 'pt', + 'daw.', + 'Definiendum (daw.) \u2014 definiens.', + '

Definiendum (daw.) \u2014 definiens.

' + ), + 'Standard footnote with leading whitespace and qualifier.'), + + ('Definiendum (Åac.) --- definiens.', ( + 'pr', + 'Åac.', + 'Definiendum (Åac.) \u2014 definiens.', + '

Definiendum (Åac.) \u2014 definiens.

' + ), + 'Plain footnote with qualifier and some emphasis.'), + + ('Definiendum (Åac.) --- definiens.', ( + 'pe', + 'Åac.', + 'Definiendum (Åac.) \u2014 definiens.', + '

Definiendum (Åac.) \u2014 definiens.

' + ), + 'Standard footnote with qualifier and some emphasis.'), + + ) + + xml_src = ''' %s ''' % "".join( + t[0] for t in annotations) + html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file() + res_annotations = list(extract_annotations(html)) + + for i, (src, expected, name) in enumerate(annotations): + yield _test_annotation, expected, res_annotations[i], name -- 2.20.1 From 14fbd48817ba27853a45164908d10ed679acac6e Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 6 Oct 2014 12:55:12 +0200 Subject: [PATCH 09/16] Disable test for unsupported behaviour in pictures. --- setup.cfg | 9 --------- tests/files/picture/angelus-novus.xml | 2 +- tests/test_picture.py | 3 ++- 3 files changed, 3 insertions(+), 11 deletions(-) delete mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 54c0b24..0000000 --- a/setup.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[nosetests] -detailed-errors=1 -with-coverage=1 -cover-package=librarian -cover-erase=1 -with-doctest=1 -exclude= - formats - tests,test_html_annotations diff --git a/tests/files/picture/angelus-novus.xml b/tests/files/picture/angelus-novus.xml index 964faed..85fa554 100644 --- a/tests/files/picture/angelus-novus.xml +++ b/tests/files/picture/angelus-novus.xml @@ -36,7 +36,7 @@

+ diff --git a/tests/test_picture.py b/tests/test_picture.py index f64f624..1169f44 100644 --- a/tests/test_picture.py +++ b/tests/test_picture.py @@ -42,7 +42,8 @@ def test_wlpicture(): def test_picture_parts(): wlp = picture.WLPicture.from_file(open(get_fixture('picture', 'angelus-novus.xml'))) parts = list(wlp.partiter()) - assert len(parts) == 5, "there should be %d parts of the picture" % 5 + expect_parts = 4 + assert len(parts) == expect_parts, "there should be %d parts of the picture" % expect_parts motifs = set() names = set() -- 2.20.1 From ac7899052e10143e0548ad7de2f67a2c6ca2b50b Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 9 Oct 2014 12:28:03 +0200 Subject: [PATCH 10/16] Fix in extract_annotations --- librarian/html.py | 2 +- tests/test_html_annotations.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/librarian/html.py b/librarian/html.py index e084ed2..85b9003 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -298,7 +298,7 @@ def extract_annotations(html_path): parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') - re_qualifier = re.compile(ur'[^\u2014]+\s+$(.+)$\s+\u2014') + re_qualifier = re.compile(ur'[^\u2014]+\s+$([^$]+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): fn_type = footnote.get('class').split('-')[1] diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index 87e9b01..851c5b0 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -86,6 +86,14 @@ def test_annotations(): ), 'Standard footnote with qualifier and some emphasis.'), + ('Definiendum (Åac.) --- definens (some) --- more text.', ( + 'pe', + 'Åac.', + 'Definiendum (Åac.) \u2014 definiens (some) \u2014 more text.', + '

Definiendum (Åac.) \u2014 definiens (some) \u2014 more text.

', + ), + 'Footnote with a second parentheses and mdash.'), + ) xml_src = ''' %s ''' % "".join( -- 2.20.1 From a3b6840527ec52ce8b6d74819633d8c85e3973ba Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Fri, 10 Oct 2014 15:08:02 +0200 Subject: [PATCH 11/16] Fix test. --- tests/test_html_annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index 851c5b0..f2b9eee 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -86,7 +86,7 @@ def test_annotations(): ), 'Standard footnote with qualifier and some emphasis.'), - ('Definiendum (Åac.) --- definens (some) --- more text.', ( + ('Definiendum (Åac.) --- definiens (some) --- more text.', ( 'pe', 'Åac.', 'Definiendum (Åac.) \u2014 definiens (some) \u2014 more text.', -- 2.20.1 From a04f11baee3eb7d090867c2d5639a120ec3217b8 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 22 Oct 2014 10:51:59 +0200 Subject: [PATCH 12/16] Data for #3396: picture style, full creation date. --- librarian/dcparser.py | 2 +- librarian/picture.py | 1 + tests/test_html_annotations.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index a33940d..f413fac 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -230,7 +230,7 @@ class WorkInfo(object): salias='funder', multiple=True, default=[]), Field( DCNS('contributor.thanks'), 'thanks', required=False), - Field( DCNS('date'), 'created_at', as_date), + Field( DCNS('date'), 'created_at'), Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), Field( DCNS('publisher'), 'publisher'), diff --git a/librarian/picture.py b/librarian/picture.py index 5d644d7..5a0c47b 100644 --- a/librarian/picture.py +++ b/librarian/picture.py @@ -32,6 +32,7 @@ class PictureInfo(WorkInfo): Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True), Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True), Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True, required=False), + Field(DCNS('subject.style'), 'styles', salias='style', multiple=True, required=False), Field(DCNS('format.dimensions'), 'dimensions', required=False), Field(DCNS('format.checksum.sha1'), 'sha1', required=True), diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index f2b9eee..f269042 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -80,8 +80,8 @@ def test_annotations(): ('Definiendum (Åac.) --- definiens.', ( 'pe', - 'Åac.', - 'Definiendum (Åac.) \u2014 definiens.', + 'Åac.', + 'Definiendum (Åac.) \u2014 definiens.', '

Definiendum (Åac.) \u2014 definiens.

' ), 'Standard footnote with qualifier and some emphasis.'), -- 2.20.1 From a3be479506edf42dc58feb22b26e4f5da1e49edd Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 23 Oct 2014 13:51:06 +0200 Subject: [PATCH 13/16] html.extract_annotations: Allow multiple footnote qualifiers. Use only ones accepted by editors. --- librarian/fn_qualifiers.py | 198 +++++++++++++++++++++ librarian/html.py | 26 ++- scripts/fn_qualifiers_list_from_redmine.py | 35 ++++ tests/test_html_annotations.py | 26 ++- 4 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 librarian/fn_qualifiers.py create mode 100644 scripts/fn_qualifiers_list_from_redmine.py diff --git a/librarian/fn_qualifiers.py b/librarian/fn_qualifiers.py new file mode 100644 index 0000000..51168e4 --- /dev/null +++ b/librarian/fn_qualifiers.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 +""" +List of standard footnote qualifiers. +This file is generated by scripts/fn_qualifiers_list_from_wiki.py, +do not edit it. +""" +from __future__ import unicode_literals + + +FN_QUALIFIERS = { + 'a.': 'albo', + 'alb.': 'albaÅski', + 'amer.': 'amerykaÅski, amerykaÅskie', + 'anat.': 'anatomiczne', + 'ang.': 'angielski, angielskie', + 'antr.': 'antropologia, antropologiczny', + 'ar.': 'arabski', + 'archeol.': 'archeologia, archeologiczny', + 'archit.': 'architektura', + 'astr.': 'astronomia', + 'austr.': 'austriacki', + 'austral.': 'australijski', + 'B.': 'biernik', + 'biaÅorus.': 'biaÅoruski', + 'biol.': 'biologia, biologiczny', + 'blm': 'bez liczby mnogiej', + 'blp': 'bez liczby pojedynczej', + 'bot.': 'botanika', + 'buÅg.': 'buÅgarski', + 'C.': 'celownik', + 'celt.': 'celtycki', + 'chem.': 'chemiczny', + 'chiÅ.': 'chiÅski', + 'chrzeÅc.': 'chrzeÅcijaÅski, chrzeÅcijaÅstwo', + 'cz.': 'czas (gramatyczny)', + 'cz.przesz.': 'czas przeszÅy', + 'cz.przysz.': 'czas przyszÅy', + 'cz.ter.': 'czas teraÅºniejszy', + 'czas.': 'czasownik', + 'czes.': 'czeski', + 'D.': 'dopeÅniacz', + 'daw.': 'dawne', + 'dk': 'dokonane', + 'druk.': 'drukarstwo, drukowany', + 'dziec.': 'dzieciÄcy', + 'egip.': 'egipski', + 'ekon.': 'ekonomiczny', + 'elektr.': 'elektryczny', + 'etn.': 'etnografia, etniczny', + 'euf.': 'eufemizm', + 'film.': 'filmowy', + 'filoz.': 'filozoficzny', + 'fiÅ.': 'fiÅski', + 'fiz.': 'fizyka', + 'fizjol.': 'fizjologia', + 'fot.': 'fotografia, fotograficzny', + 'fr.': 'francuski', + 'fraz.': 'frazeologia, frazeologiczny', + 'fragm.': 'fragment', + 'genet.': 'genetyka, genetyczny', + 'geogr.': 'geografia, geograficzny', + 'geol.': 'geologia', + 'geom.': 'geometria', + 'gr.': 'grecki', + 'gw.': 'gwara, gwarowe', + 'hand.': 'handel, handlowy', + 'hebr.': 'hebrajski', + 'hind.': 'hinduski', + 'hist.': 'historia, historyczny', + 'hiszp.': 'hiszpaÅski', + 'hol.': 'holenderski', + 'im.': 'imienia', + 'imiesÅ.': 'imiesÅÃ³w, imiesÅowowy', + 'in.': 'inne, inny', + 'inf.': 'informacja', + 'inform.': 'informatyka', + 'irl.': 'irlandzki', + 'iron.': 'ironicznie', + 'isl.': 'islandzki', + 'itd.': 'i tak dalej', + 'itp.': 'i tym podobne', + 'jap.': 'japoÅski', + 'jÄz.': 'jÄzyk, jÄzykowy, jÄzykoznawstwo', + 'kg': 'kilogram', + 'km': 'kilometr', + 'lit.': 'literacki, literatura', + 'lm': 'liczba mnoga', + 'Åac.': 'Åacina, ÅaciÅskie', + 'M.': 'mianownik', + 'm.': 'mÄski', + 'mat.': 'matematyka', + 'med.': 'medyczne', + 'meteor.': 'meteorologia, meteorologiczny', + 'min.': 'minuta', + 'm.in.': 'miÄdzy innymi', + 'miner.': 'mineralogia', + 'mit.': 'mitologia', + 'mit. germ.': 'mitologia germaÅska', + 'mit. gr.': 'mitologia grecka', + 'mit. rzym.': 'mitologia rzymska', + 'mors.': 'morskie', + 'm.-os.': 'mÄskoosobowy', + 'Ms.': 'miejscownik', + 'muz.': 'muzyczny', + 'N.': 'narzÄdnik', + 'n.': 'nijaki', + 'ndk': 'niedokonany', + 'ndm': 'nieodmienny', + 'n.e.': 'nasza era', + 'nieos.': 'nieosobowy', + 'niem.': 'niemiecki', + 'norw.': 'norweski', + 'np.': 'na przykÅad', + 'obelÅ¼.': 'obelÅ¼ywie', + 'odm.': 'odmienny', + 'ok.': 'okoÅo', + 'os.': 'osoba, osobowy', + 'pÅd.': 'poÅudniowy', + 'pÅn.': 'pÃ³Ånocny', + 'p.n.e.': 'przed naszÄ erÄ', + 'pocz.': 'poczÄtek', + 'poet.': 'poetyckie', + 'pogard.': 'pogardliwe', + 'pol.': 'polski', + 'polit.': 'polityczny', + 'poÅ.': 'poÅowa', + 'popr.': 'poprawnie', + 'por.': 'porÃ³wnaj', + 'port.': 'portugalski', + 'posp.': 'pospolity', + 'pot.': 'potocznie', + 'praw.': 'prawo, prawnicze', + 'przen.': 'przenoÅnie', + 'przestarz.': 'przestarzaÅe', + 'przesz.': 'przeszÅy', + 'przym.': 'przymiotnik', + 'przysÅ.': 'przysÅowiowy', + 'przysÅÃ³w.': 'przysÅÃ³wek', + 'przysz.': 'przyszÅy', + 'psychol.': 'psychologia, psychologiczny', + 'r.': 'rok', + 'r.m.': 'rodzaj mÄski', + 'r.n.': 'rodzaj nijaki', + 'r.Å¼.': 'rodzaj Å¼eÅski', + 'reg.': 'regionalne', + 'rel.': 'religijny, religioznawstwo', + 'rodz.': 'rodzaj', + 'roln.': 'rolnictwo, rolniczy', + 'ros.': 'rosyjski', + 'rub.': 'rubasznie', + 'rum.': 'rumuÅski', + 'rzad.': 'rzadki', + 'rzecz.': 'rzeczownik', + 'rzym.': 'rzymski', + 'skand.': 'skandynawski', + 'skrÃ³t.': 'skrÃ³towiec', + 'sÅowac.': 'sÅowacki', + 'socjol.': 'socjologiczny', + 'sport.': 'sportowy', + 'st.': 'stopieÅ', + 'starop.': 'staropolskie', + 'staroÅ¼.': 'staroÅ¼ytny', + 'szt.': 'sztuka', + 'szwedz.': 'szwedzki', + 'År.': 'Årodek, Årodkowy', + 'Årod.': 'Årodowiskowy', + 'teatr.': 'teatralny', + 'techn.': 'techniczny', + 'temp.': 'temperatura', + 'ter.': 'teraÅºniejszy', + 'tur.': 'turecki', + 'tur.-tat.': 'turecko-tatarski', + 'tys.': 'tysiÄc', + 'tzn.': 'to znaczy', + 'uczn.': 'uczniowski', + 'ukr.': 'ukraiÅski', + 'urb.': 'urbanistyka', + 'W.': 'woÅacz', + 'w.': 'wiek', + 'wÄg.': 'wÄgierski', + 'wg': 'wedÅug', + 'wÅ.': 'wÅoski', + 'wojsk.': 'wojskowy', + 'wsch.': 'wschodni', + 'wspÃ³Å.': 'wspÃ³ÅczeÅnie', + 'wulg.': 'wulgarne', + 'wym.': 'wymawiaj', + 'zach.': 'zachodnie', + 'zdr.': 'zdrobnienie', + 'zgr.': 'zgrubienie', + 'zn.': 'znaczy, znaczenie', + 'zob.': 'zobacz', + 'zool.': 'zoologia', + 'zwÅ.': 'zwÅaszcza', + 'Å¼.': 'Å¼eÅski', + 'Å¼art.': 'Å¼artobliwie', + 'Å¼egl.': 'Å¼eglarskie', + } diff --git a/librarian/html.py b/librarian/html.py index 85b9003..6115b31 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -294,7 +294,14 @@ def add_table_of_themes(root): def extract_annotations(html_path): - """For each annotation, yields a tuple: anchor, text, html.""" + """Extracts annotations from HTML for annotations dictionary. + + For each annotation, yields a tuple of: + anchor, footnote type, valid qualifiers, text, html. + + """ + from .fn_qualifiers import FN_QUALIFIERS + parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') @@ -309,10 +316,21 @@ def extract_annotations(html_path): footnote[-1].tail = None text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() - qualifier = None + match = re_qualifier.match(text_str) if match: - qualifier = match.group(1) + qualifier_str = match.group(1) + qualifiers = [] + for candidate in re.split('[;,]', qualifier_str): + candidate = candidate.strip() + if candidate in FN_QUALIFIERS: + qualifiers.append(candidate) + elif candidate.startswith('z '): + subcandidate = candidate.split()[1] + if subcandidate in FN_QUALIFIERS: + qualifiers.append(subcandidate) + else: + qualifiers = [] - yield anchor, fn_type, qualifier, text_str, html_str + yield anchor, fn_type, qualifiers, text_str, html_str diff --git a/scripts/fn_qualifiers_list_from_redmine.py b/scripts/fn_qualifiers_list_from_redmine.py new file mode 100644 index 0000000..020b119 --- /dev/null +++ b/scripts/fn_qualifiers_list_from_redmine.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +""" +This scripts reads the table of footnote qualifiers from Redmine +and produces contents of fn_qualifiers.py â a list of valid qualifiers. +""" + +from lxml import etree +from urllib2 import urlopen + +url = 'http://redmine.nowoczesnapolska.org.pl/projects/wl-publikacje/wiki/Lista_skr%C3%B3t%C3%B3w' + +parser = etree.HTMLParser() +tree = etree.parse(urlopen(url), parser) + +print """\ +# -*- coding: utf-8 +\""" +List of standard footnote qualifiers. +This file is generated by scripts/fn_qualifiers_list_from_wiki.py, +do not edit it. +\""" +from __future__ import unicode_literals + + +FN_QUALIFIERS = {""".encode('utf-8') + +for td in tree.findall('//td'): + print (" '%s': '%s'," % ( + td[0].text.replace('\\', '\\\\').replace("'", "\\'"), + td[0].tail.strip(' -').replace('\\', '\\\\').replace("'", "\\'") + )).encode('utf-8') + +print """ }""".encode('utf-8') diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index f269042..4956b7d 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -23,7 +23,7 @@ def test_annotations(): ('', ( 'pe', - None, + [], '', '

' ), @@ -32,7 +32,7 @@ def test_annotations(): ( 'Definiendum --- definiens.', ( 'pr', - None, + [], 'Definiendum \u2014 definiens.', '

Definiendum \u2014 definiens.

' ), @@ -40,7 +40,7 @@ def test_annotations(): ('Definiendum --- definiens.', ( 'pt', - None, + [], 'Definiendum \u2014 definiens.', '

Definiendum \u2014 definiens.

' ), @@ -48,7 +48,7 @@ def test_annotations(): ('Definiendum (Åac.) --- definiens.', ( 'pr', - 'Åac.', + ['Åac.'], 'Definiendum (Åac.) \u2014 definiens.', '

Definiendum (Åac.) \u2014 definiens.

' ), @@ -56,7 +56,7 @@ def test_annotations(): ('Definiendum (Åac.) --- definiens.', ( 'pe', - 'Åac.', + ['Åac.'], 'Definiendum (Åac.) \u2014 definiens.', '

Definiendum (Åac.) \u2014 definiens.

' ), @@ -64,7 +64,7 @@ def test_annotations(): (' Definiendum (daw.) --- definiens.', ( 'pt', - 'daw.', + ['daw.'], 'Definiendum (daw.) \u2014 definiens.', '

Definiendum (daw.) \u2014 definiens.

' ), @@ -72,7 +72,7 @@ def test_annotations(): ('Definiendum (Åac.) --- definiens.', ( 'pr', - 'Åac.', + ['Åac.'], 'Definiendum (Åac.) \u2014 definiens.', '

Definiendum (Åac.) \u2014 definiens.

' ), @@ -80,7 +80,7 @@ def test_annotations(): ('Definiendum (Åac.) --- definiens.', ( 'pe', - 'Åac.', + ['Åac.'], 'Definiendum (Åac.) \u2014 definiens.', '

Definiendum (Åac.) \u2014 definiens.

' ), @@ -88,12 +88,20 @@ def test_annotations(): ('Definiendum (Åac.) --- definiens (some) --- more text.', ( 'pe', - 'Åac.', + ['Åac.'], 'Definiendum (Åac.) \u2014 definiens (some) \u2014 more text.', '

Definiendum (Åac.) \u2014 definiens (some) \u2014 more text.

', ), 'Footnote with a second parentheses and mdash.'), + ('gemajna (daw., z niem. gemein: zwykÅy) --- czÄÅciej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.', ( + 'pe', + ['daw.', 'niem.'], + 'gemajna (daw., z niem. gemein: zwykÅy) \u2014 czÄÅciej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.', + '

gemajna (daw., z niem. gemein: zwykÅy) \u2014 czÄÅciej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.

' + ), + 'Footnote with multiple and qualifiers and emphasis.'), + ) xml_src = ''' %s ''' % "".join( -- 2.20.1 From 5f92f5a341c58a30c0a5c0d64ba3ed498bb9db85 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Sun, 28 Dec 2014 00:35:25 +0100 Subject: [PATCH 14/16] Preliminary math and tables support. --- librarian/embeds/__init__.py | 56 ++++++++++++++++ librarian/embeds/latex.py | 21 ++++++ librarian/embeds/mathml.py | 10 +++ librarian/epub.py | 2 + librarian/epub/style.css | 7 ++ librarian/epub/xsltScheme.xsl | 27 +++++++- librarian/fb2/paragraphs.xslt | 10 +++ librarian/functions.py | 21 ++++++ librarian/pdf.py | 13 ++++ librarian/pdf/wl.cls | 5 ++ librarian/pdf/wl2tex.xslt | 53 +++++++++++++++ librarian/res/embeds/latex/template.tex | 9 +++ librarian/res/embeds/mathml/mathml2latex.xslt | 66 +++++++++++++++++++ librarian/xslt/book2html.xslt | 16 ++++- librarian/xslt/book2txt.xslt | 18 +++++ setup.py | 2 +- 16 files changed, 333 insertions(+), 3 deletions(-) create mode 100644 librarian/embeds/__init__.py create mode 100644 librarian/embeds/latex.py create mode 100644 librarian/embeds/mathml.py create mode 100644 librarian/res/embeds/latex/template.tex create mode 100644 librarian/res/embeds/mathml/mathml2latex.xslt diff --git a/librarian/embeds/__init__.py b/librarian/embeds/__init__.py new file mode 100644 index 0000000..3b1abdb --- /dev/null +++ b/librarian/embeds/__init__.py @@ -0,0 +1,56 @@ +import importlib +from lxml import etree + +known_types = { + 'application/mathml+xml': 'librarian.embeds.mathml.MathML', + 'application/x-latex': 'librarian.embeds.latex.LaTeX', +} + +class Embed(): + @classmethod + def transforms_to(cls, mime_types, downgrade=False): + matches = set() + for name, method in cls.__dict__.iteritems(): + if hasattr(method, "embed_converts_to"): + conv_type, conv_downgrade = method.embed_converts_to + if downgrade == conv_downgrade and conv_type in mime_types: + matches.add(conv_type) + return matches + + def transform_to(self, mime_type, downgrade=False): + for name, method in type(cls).__dict__.iteritems(): + if hasattr(method, "embed_converts_to"): + conv_type, conv_downgrade = method.embed_converts_to + if downgrade == conv_downgrade and conv_type == mime_type: + return method(self) + + +class DataEmbed(Embed): + def __init__(self, data=None): + self.data = data + +class TreeEmbed(Embed): + def __init__(self, tree=None): + if isinstance(tree, etree._Element): + tree = etree.ElementTree(tree) + self.tree = tree + +def converts_to(mime_type, downgrade=False): + def decorator(method): + method.embed_converts_to = mime_type, downgrade + return method + return decorator + +def downgrades_to(mime_type): + return converts_to(mime_type, True) + +def create_embed(mime_type, tree=None, data=None): + embed = known_types.get(mime_type) + if embed is None: + embed = DataEmbed if tree is None else TreeEmbed + else: + mod_name, cls_name = embed.rsplit('.', 1) + mod = importlib.import_module(mod_name) + embed = getattr(mod, cls_name) + + return embed(data if tree is None else tree) diff --git a/librarian/embeds/latex.py b/librarian/embeds/latex.py new file mode 100644 index 0000000..e10d165 --- /dev/null +++ b/librarian/embeds/latex.py @@ -0,0 +1,21 @@ +import os +import shutil +from subprocess import call, PIPE +from tempfile import mkdtemp +from librarian import get_resource +from . import DataEmbed, create_embed, downgrades_to, converts_to + +class LaTeX(DataEmbed): + @downgrades_to('image/png') + def to_png(self): + tmpl = open(get_resource('res/embeds/latex/template.tex')).read().decode('utf-8') + tempdir = mkdtemp('-librarian-embed-latex') + fpath = os.path.join(tempdir, 'doc.tex') + with open(fpath, 'w') as f: + f.write((tmpl % {'code': self.data}).encode('utf-8')) + call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE) + call(['convert', '-density', '150', os.path.join(tempdir, 'doc.pdf'), '-trim', + os.path.join(tempdir, 'doc.png')]) + pngdata = open(os.path.join(tempdir, 'doc.png')).read() + shutil.rmtree(tempdir) + return create_embed('image/png', data=pngdata) diff --git a/librarian/embeds/mathml.py b/librarian/embeds/mathml.py new file mode 100644 index 0000000..f99f979 --- /dev/null +++ b/librarian/embeds/mathml.py @@ -0,0 +1,10 @@ +from lxml import etree +from librarian import get_resource +from . import TreeEmbed, create_embed, downgrades_to, converts_to + +class MathML(TreeEmbed): + @downgrades_to('application/x-latex') + def to_latex(self): + xslt = etree.parse(get_resource('res/embeds/mathml/mathml2latex.xslt')) + output = self.tree.xslt(xslt) + return create_embed('application/x-latex', data=unicode(output)) diff --git a/librarian/epub.py b/librarian/epub.py index 1ea2688..bf58a9f 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -520,6 +520,8 @@ def transform(wldoc, verbose=False, output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False) zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) + functions.reg_mathml_epub(zip) + # write static elements mime = zipfile.ZipInfo() mime.filename = 'mimetype' diff --git a/librarian/epub/style.css b/librarian/epub/style.css index 1f5d11b..57f5490 100644 --- a/librarian/epub/style.css +++ b/librarian/epub/style.css @@ -368,3 +368,10 @@ p.minor-info { p.footer { margin-top: 2em; } + +table { + border-collapse: collapse; +} +td { + border: 1px solid black; +} diff --git a/librarian/epub/xsltScheme.xsl b/librarian/epub/xsltScheme.xsl index d2d7871..1c066d9 100644 --- a/librarian/epub/xsltScheme.xsl +++ b/librarian/epub/xsltScheme.xsl @@ -1,5 +1,5 @@ - + @@ -284,6 +284,31 @@ + + + + + + + + + + + +

+ + + + + + + + + + + + + diff --git a/librarian/fb2/paragraphs.xslt b/librarian/fb2/paragraphs.xslt index 68c6257..334412e 100644 --- a/librarian/fb2/paragraphs.xslt +++ b/librarian/fb2/paragraphs.xslt @@ -39,6 +39,16 @@

ââââââââ

+ +

+ + + + + + + + diff --git a/librarian/functions.py b/librarian/functions.py index b88a7fb..659bb94 100644 --- a/librarian/functions.py +++ b/librarian/functions.py @@ -121,3 +121,24 @@ def reg_lang_code_3to2(): _register_function(lang_code_3to2) +def mathml_latex(context, trees): + from librarian.embeds.mathml import MathML + text = MathML(trees[0]).to_latex().data + # Remove invisible multiplications, they produce unwanted spaces. + text = text.replace(u'\u2062', '') + return text + +def reg_mathml_latex(): + _register_function(mathml_latex) + +def reg_mathml_epub(zipf): + from librarian.embeds.mathml import MathML + def mathml(context, trees): + data = MathML(trees[0]).to_latex().to_png().data + name = "math%d.png" % mathml.count + mathml.count += 1 + zipf.writestr('OPS/' + name, data) + return name + mathml.count = 0 + _register_function(mathml) + diff --git a/librarian/pdf.py b/librarian/pdf.py index 12c07ea..95883e1 100644 --- a/librarian/pdf.py +++ b/librarian/pdf.py @@ -95,6 +95,17 @@ def fix_hanging(doc): exclude=[DCNS("identifier.url"), DCNS("rights.license")] ) +def fix_tables(doc): + for kol in doc.iter(tag='kol'): + if kol.tail is not None: + if not kol.tail.strip(): + kol.tail = None + for table in doc.iter(tag='tabela'): + if table.get('ramka') == '1' or table.get('ramki') == '1': + table.set('_format', '|' + 'X|' * len(table[0])) + else: + table.set('_format', 'X' * len(table[0])) + def move_motifs_inside(doc): """ moves motifs to be into block elements """ @@ -245,10 +256,12 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, parse_creator(document.edoc) substitute_hyphens(document.edoc) fix_hanging(document.edoc) + fix_tables(document.edoc) # wl -> TeXML style_filename = get_stylesheet("wl2tex") style = etree.parse(style_filename) + functions.reg_mathml_latex() # TeXML -> LaTeX temp = mkdtemp('-wl2pdf') diff --git a/librarian/pdf/wl.cls b/librarian/pdf/wl.cls index 8907b08..a802e20 100644 --- a/librarian/pdf/wl.cls +++ b/librarian/pdf/wl.cls @@ -73,6 +73,11 @@ \usepackage{xunicode} \usepackage{xltxtra} +\usepackage{longtable} +\usepackage{tabu} +\usepackage{unicode-math} +\setmathfont{Latin Modern Math} + \usepackage[overload]{textcase} \usepackage{scalefnt} \usepackage[colorlinks=true,linkcolor=black,setpagesize=false,urlcolor=black,xetex]{hyperref} diff --git a/librarian/pdf/wl2tex.xslt b/librarian/pdf/wl2tex.xslt index d39b61a..2548abc 100644 --- a/librarian/pdf/wl2tex.xslt +++ b/librarian/pdf/wl2tex.xslt @@ -435,6 +435,59 @@ + + + $ + + $ + + + + + + $$ + + $$ + + + + + + 1em + + to \textwidth + +

+ + + + + + + + + + + + 1em + + + + + + + + + + + + + + + + + + diff --git a/librarian/res/embeds/latex/template.tex b/librarian/res/embeds/latex/template.tex new file mode 100644 index 0000000..8e4b807 --- /dev/null +++ b/librarian/res/embeds/latex/template.tex @@ -0,0 +1,9 @@ +\documentclass{article} +\usepackage{unicode-math} +\setmathfont{Latin Modern Math} +\pagestyle{empty} +\begin{document} + +$%(code)s$ + +\end{document} diff --git a/librarian/res/embeds/mathml/mathml2latex.xslt b/librarian/res/embeds/mathml/mathml2latex.xslt new file mode 100644 index 0000000..76ccf95 --- /dev/null +++ b/librarian/res/embeds/mathml/mathml2latex.xslt @@ -0,0 +1,66 @@ + + + + + + \textrm{ + + } + + + + + + + + + + + + + + + + { + + }^{ + + } + + + + { + + }_{ + + } + + + + { + + } + + + + ( + + ) + + + + \frac{ + + }{ + + } + + + + \varepsilon + + + diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt index 499a1dc..201381c 100644 --- a/librarian/xslt/book2html.xslt +++ b/librarian/xslt/book2html.xslt @@ -231,6 +231,20 @@

+ +

+ + + + + + + + + + + + @@ -244,7 +258,7 @@ - + diff --git a/librarian/xslt/book2txt.xslt b/librarian/xslt/book2txt.xslt index 317e581..a578492 100644 --- a/librarian/xslt/book2txt.xslt +++ b/librarian/xslt/book2txt.xslt @@ -233,6 +233,24 @@ + + + + + + + + + + + + + + + + + + diff --git a/setup.py b/setup.py index 732f145..10abe6e 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ setup( maintainer='Radek Czajka', maintainer_email='radoslaw.czajka@nowoczesnapolska.org.pl', url='http://github.com/fnp/librarian', - packages=['librarian'], + packages=['librarian', 'librarian.embeds'], package_data={'librarian': ['xslt/*.xslt', 'epub/*', 'mobi/*', 'pdf/*', 'fb2/*', 'fonts/*'] + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'res') + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'font-optimizer')}, -- 2.20.1 From 141733d3db8c11f1eb69a9c0195f07c3c2ed3f8f Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 16 Apr 2015 13:20:40 +0200 Subject: [PATCH 15/16] Minor fixups. --- librarian/pdf/wl2tex.xslt | 2 +- librarian/res/embeds/mathml/mathml2latex.xslt | 20 +++++++++---------- librarian/xslt/book2html.xslt | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/librarian/pdf/wl2tex.xslt b/librarian/pdf/wl2tex.xslt index 2548abc..4d7ff03 100644 --- a/librarian/pdf/wl2tex.xslt +++ b/librarian/pdf/wl2tex.xslt @@ -36,7 +36,7 @@ - \usepackage[maxfloats=64]{morefloats} + \usepackage[maxfloats=53]{morefloats} diff --git a/librarian/res/embeds/mathml/mathml2latex.xslt b/librarian/res/embeds/mathml/mathml2latex.xslt index 76ccf95..92f60fc 100644 --- a/librarian/res/embeds/mathml/mathml2latex.xslt +++ b/librarian/res/embeds/mathml/mathml2latex.xslt @@ -5,25 +5,25 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> - + \textrm{

}

- +

- + {

}^{ @@ -31,7 +31,7 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> }

- + {

}_{ @@ -39,19 +39,19 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> }

- + {

}

- + (

)

- + \frac{

}{ @@ -59,7 +59,7 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> }

- + \varepsilon diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt index 201381c..9a2b771 100644 --- a/librarian/xslt/book2html.xslt +++ b/librarian/xslt/book2html.xslt @@ -242,7 +242,7 @@ - +

@@ -258,7 +258,7 @@ - +

-- 2.20.1 From ff2a09e9ecd8e9bede2d3572942bcd32f66f6198 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 6 May 2015 14:03:20 +0200 Subject: [PATCH 16/16] FB2 footnotes fix. --- librarian/fb2/fb2.xslt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librarian/fb2/fb2.xslt b/librarian/fb2/fb2.xslt index 950b526..2f322e8 100644 --- a/librarian/fb2/fb2.xslt +++ b/librarian/fb2/fb2.xslt @@ -26,7 +26,7 @@

- +

-- 2.20.1

Przypisy