From 2c73c162844627d26991915fecc7e39f02bc34b8 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 6 Oct 2014 12:44:28 +0200 Subject: [PATCH] Change extract_annotations return value. --- librarian/html.py | 20 +++++-- librarian/xslt/book2html.xslt | 1 + setup.cfg | 9 ++++ tests/test_html_annotations.py | 97 ++++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 5 deletions(-) create mode 100644 setup.cfg create mode 100644 tests/test_html_annotations.py diff --git a/librarian/html.py b/librarian/html.py index 0eeb76b..e084ed2 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -4,6 +4,7 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # import os +import re import cStringIO import copy @@ -292,17 +293,26 @@ def add_table_of_themes(root): root.insert(0, themes_div) - def extract_annotations(html_path): """For each annotation, yields a tuple: anchor, text, html.""" parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') + re_qualifier = re.compile(ur'[^\u2014]+\s+\((.+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): - anchor = footnote.find('a[@name]').get('name') + fn_type = footnote.get('class').split('-')[1] + anchor = footnote.find('a[@class="annotation"]').get('href')[1:] del footnote[:2] - text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip() - html_str = etree.tostring(footnote, method='html', encoding='utf-8') - yield anchor, text_str, html_str + footnote.text = None + if len(footnote) and footnote[-1].tail == '\n': + footnote[-1].tail = None + text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() + html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() + qualifier = None + match = re_qualifier.match(text_str) + if match: + qualifier = match.group(1) + + yield anchor, fn_type, qualifier, text_str, html_str diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt index 5e3228a..499a1dc 100644 --- a/librarian/xslt/book2html.xslt +++ b/librarian/xslt/book2html.xslt @@ -40,6 +40,7 @@

Przypisy

+ fn- [] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..54c0b24 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,9 @@ +[nosetests] +detailed-errors=1 +with-coverage=1 +cover-package=librarian +cover-erase=1 +with-doctest=1 +exclude= + formats + tests,test_html_annotations diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py new file mode 100644 index 0000000..87e9b01 --- /dev/null +++ b/tests/test_html_annotations.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 +from __future__ import unicode_literals + +from StringIO import StringIO +import tempfile +from librarian.parser import WLDocument +from librarian.html import extract_annotations +from lxml import etree +from nose.tools import eq_ + + +def _test_annotation(expected, got, name): + assert got[0].startswith('anchor-'), "%s: Unexpected anchor: '%s', should begin with 'anchor-'" % (name, got[0]) + eq_(expected[0], got[1], "%s: Unexpected type, expected '%s', got '%s'" % (name, expected[0], got[1])) + eq_(expected[1], got[2], "%s: Unexpected qualifier, expected '%s', got '%s'" % (name, expected[1], got[2])) + eq_(expected[2], got[3], "%s: Unexpected text representation, expected '%s', got '%s'" % (name, expected[2], got[3])) + exp_html = '
%s
' % (expected[0], expected[3]) + eq_(exp_html, got[4], "%s: Unexpected html representation, expected '%s', got '%s'" % (name, exp_html, got[4])) + + +def test_annotations(): + annotations = ( + + ('', ( + 'pe', + None, + '', + '

' + ), + 'Empty footnote'), + + ( + 'Definiendum --- definiens.', ( + 'pr', + None, + 'Definiendum \u2014 definiens.', + '

Definiendum \u2014 definiens.

' + ), + 'Plain footnote.'), + + ('Definiendum --- definiens.', ( + 'pt', + None, + 'Definiendum \u2014 definiens.', + '

Definiendum \u2014 definiens.

' + ), + 'Standard footnote.'), + + ('Definiendum (łac.) --- definiens.', ( + 'pr', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Plain footnote with qualifier'), + + ('Definiendum (łac.) --- definiens.', ( + 'pe', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Standard footnote with qualifier.'), + + (' Definiendum (daw.) --- definiens.', ( + 'pt', + 'daw.', + 'Definiendum (daw.) \u2014 definiens.', + '

Definiendum (daw.) \u2014 definiens.

' + ), + 'Standard footnote with leading whitespace and qualifier.'), + + ('Definiendum (łac.) --- definiens.', ( + 'pr', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Plain footnote with qualifier and some emphasis.'), + + ('Definiendum (łac.) --- definiens.', ( + 'pe', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Standard footnote with qualifier and some emphasis.'), + + ) + + xml_src = ''' %s ''' % "".join( + t[0] for t in annotations) + html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file() + res_annotations = list(extract_annotations(html)) + + for i, (src, expected, name) in enumerate(annotations): + yield _test_annotation, expected, res_annotations[i], name -- 2.20.1