From a4d6831b455ed69b196905427600d7163cdd3fe8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Robert=20B=C5=82aut?= Date: Tue, 1 Apr 2014 13:00:25 +0200 Subject: [PATCH 01/16] [mobi] pep8 --- librarian/mobi.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/librarian/mobi.py b/librarian/mobi.py index 82ff343..b30c1fe 100644 --- a/librarian/mobi.py +++ b/librarian/mobi.py @@ -3,6 +3,7 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # + from copy import deepcopy import os import subprocess @@ -13,7 +14,7 @@ from librarian.cover import DefaultEbookCover from librarian import get_resource -def transform(wldoc, verbose=False, sample=None, cover=None, +def transform(wldoc, verbose=False, sample=None, cover=None, use_kindlegen=False, flags=None): """ produces a MOBI file @@ -30,7 +31,7 @@ def transform(wldoc, verbose=False, sample=None, cover=None, if not flags: flags = [] flags = list(flags) - + epub = document.as_epub(verbose=verbose, sample=sample, html_toc=True, cover=True, flags=flags) if verbose: @@ -39,20 +40,17 @@ def transform(wldoc, verbose=False, sample=None, cover=None, devnull = open("/dev/null", 'w') kwargs = {"stdout": devnull, "stderr": devnull} - output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi', delete=False) + output_file = NamedTemporaryFile(prefix='librarian', suffix='.mobi', + delete=False) output_file.close() if use_kindlegen: output_file_basename = os.path.basename(output_file.name) - subprocess.check_call( - ['kindlegen', '-c2', epub.get_filename(), '-o', output_file_basename], - **kwargs - ) + subprocess.check_call(['kindlegen', '-c2', epub.get_filename(), + '-o', output_file_basename], **kwargs) else: - subprocess.check_call( - ['ebook-convert', epub.get_filename(), output_file.name, - '--no-inline-toc', - '--mobi-file-type=both', - '--mobi-ignore-margins'], **kwargs - ) + 
subprocess.check_call(['ebook-convert', epub.get_filename(), + output_file.name, '--no-inline-toc', + '--mobi-file-type=both', + '--mobi-ignore-margins'], **kwargs) return OutputFile.from_filename(output_file.name) -- 2.20.1 From a10487d4ba6917735432fcbda98faf8fd35a77e3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Robert=20B=C5=82aut?= Date: Tue, 1 Apr 2014 13:03:36 +0200 Subject: [PATCH 02/16] [epub] pep8 tabs removed --- librarian/epub.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/librarian/epub.py b/librarian/epub.py index e52e524..e8b972a 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -427,7 +427,7 @@ def transform(wldoc, verbose=False, chars = used_chars(html_tree.getroot()) zip.writestr('OPS/title.html', etree.tostring(html_tree, pretty_print = True, - xml_declaration = True, + xml_declaration = True, encoding = "utf-8", doctype='')) @@ -442,7 +442,7 @@ def transform(wldoc, verbose=False, html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl')) chars = used_chars(html_tree.getroot()) html_string = etree.tostring(html_tree, - pretty_print = True, + pretty_print = True, xml_declaration = True, encoding = "utf-8", doctype='')) if bound_cover.uses_dc_cover: @@ -594,9 +594,9 @@ def transform(wldoc, verbose=False, chars = chars.union(used_chars(html_tree.getroot())) zip.writestr('OPS/annotations.html', etree.tostring( html_tree, pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='')) toc.add("Wesprzyj Wolne Lektury", "support.html") @@ -617,9 +617,9 @@ def transform(wldoc, verbose=False, chars.update(used_chars(html_tree.getroot())) zip.writestr('OPS/last.html', etree.tostring( html_tree, pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='')) if not flags or not 'without-fonts' in flags: -- 2.20.1 From 26fba30d99a051b3779a36f332993795d7f1c376 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Robert=20B=C5=82aut?= Date: Tue, 1 Apr 2014 13:27:08 +0200 Subject: 
[PATCH 03/16] [epub] pep8 --- librarian/epub.py | 169 +++++++++++++++++++++++++--------------------- 1 file changed, 93 insertions(+), 76 deletions(-) diff --git a/librarian/epub.py b/librarian/epub.py index e8b972a..8ef436e 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -26,6 +26,7 @@ from librarian.hyphenator import Hyphenator functions.reg_person_name() functions.reg_lang_code_3to2() + def set_hyph_language(source_tree): def get_short_lng_code(text): result = '' @@ -34,18 +35,21 @@ def set_hyph_language(source_tree): for line in f: list = line.strip().split('|') if list[0] == text: - result=list[2] + result = list[2] if result == '': return text else: return result - bibl_lng = etree.XPath('//dc:language//text()', namespaces = {'dc':str(DCNS)})(source_tree) - short_lng = get_short_lng_code(bibl_lng[0]) + bibl_lng = etree.XPath('//dc:language//text()', + namespaces={'dc': str(DCNS)})(source_tree) + short_lng = get_short_lng_code(bibl_lng[0]) try: - return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + short_lng + '.dic')) + return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + + short_lng + '.dic')) except: pass - + + def hyphenate_and_fix_conjunctions(source_tree, hyph): if hyph is not None: texts = etree.XPath('/utwor/*[2]//text()')(source_tree) @@ -54,13 +58,14 @@ def hyphenate_and_fix_conjunctions(source_tree, hyph): newt = '' wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t) for w in wlist: - newt += hyph.inserted(w, u'\u00AD') + newt += hyph.inserted(w, u'\u00AD') newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt) if t.is_text: parent.text = newt elif t.is_tail: parent.tail = newt - + + def inner_xml(node): """ returns node's text and children as a string @@ -71,6 +76,7 @@ def inner_xml(node): nt = node.text if node.text is not None else '' return ''.join([nt] + [etree.tostring(child) for child in node]) + def set_inner_xml(node, text): """ sets node's text and children from a string @@ -134,7 +140,7 @@ def 
find_annotations(annotations, source, part_no): for child in source: if child.tag in ('pe', 'pa', 'pt', 'pr'): annotation = deepcopy(child) - number = str(len(annotations)+1) + number = str(len(annotations) + 1) annotation.set('number', number) annotation.set('part', str(part_no)) annotation.tail = '' @@ -159,7 +165,7 @@ class Stanza(object): >>> print etree.tostring(s) a c cbx/ ycd - + """ def __init__(self, stanza_elem): self.stanza = stanza_elem @@ -221,18 +227,17 @@ def add_to_manifest(manifest, partno): """ Adds a node to the manifest section in content.opf file """ partstr = 'part%d' % partno - e = manifest.makeelement(OPFNS('item'), attrib={ - 'id': partstr, - 'href': partstr + '.html', - 'media-type': 'application/xhtml+xml', - }) + e = manifest.makeelement( + OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html', + 'media-type': 'application/xhtml+xml'} + ) manifest.append(e) def add_to_spine(spine, partno): """ Adds a node to the spine section in content.opf file """ - e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno}); + e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno}) spine.append(e) @@ -246,7 +251,7 @@ class TOC(object): def add(self, name, part_href, level=0, is_part=True, index=None): assert level == 0 or index is None if level > 0 and self.children: - return self.children[-1].add(name, part_href, level-1, is_part) + return self.children[-1].add(name, part_href, level - 1, is_part) else: t = TOC(name) t.part_href = part_href @@ -304,7 +309,7 @@ class TOC(object): texts.append( "
%s
" % (depth, child.href(), child.name)) - texts.append(child.html_part(depth+1)) + texts.append(child.html_part(depth + 1)) return "\n".join(texts) def html(self): @@ -327,10 +332,10 @@ def chop(main_text): # prepare a container for each chunk part_xml = etree.Element('utwor') etree.SubElement(part_xml, 'master') - main_xml_part = part_xml[0] # master + main_xml_part = part_xml[0] # master last_node_part = False - + # the below loop are workaround for a problem with epubs in drama ebooks without acts is_scene = False is_act = False @@ -340,7 +345,7 @@ def chop(main_text): is_scene = True elif name == 'naglowek_akt': is_act = True - + for one_part in main_text: name = one_part.tag if is_act is False and is_scene is True: @@ -364,7 +369,7 @@ def chop(main_text): main_xml_part[:] = [deepcopy(one_part)] else: main_xml_part.append(deepcopy(one_part)) - last_node_part = False + last_node_part = False yield part_xml @@ -390,11 +395,12 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s replace_by_verse(chunk_xml) html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl')) chars = used_chars(html_tree.getroot()) - output_html = etree.tostring(html_tree, pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='') + output_html = etree.tostring( + html_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + ) return output_html, toc, chars @@ -412,11 +418,10 @@ def transform(wldoc, verbose=False, """ processes one input file and proceeds to its children """ replace_characters(wldoc.edoc.getroot()) - + hyphenator = set_hyph_language(wldoc.edoc.getroot()) hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator) - - + # every input file will have a TOC entry, # pointing to starting chunk toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter) @@ -425,12 +430,15 @@ def transform(wldoc, verbose=False, # write book title page html_tree = xslt(wldoc.edoc, 
get_resource('epub/xsltTitle.xsl')) chars = used_chars(html_tree.getroot()) - zip.writestr('OPS/title.html', - etree.tostring(html_tree, pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='')) + zip.writestr( + 'OPS/title.html', + etree.tostring( + html_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + ) + ) # add a title page TOC entry toc.add(u"Strona tytułowa", "title.html") elif wldoc.book_info.parts: @@ -441,12 +449,12 @@ def transform(wldoc, verbose=False, else: html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl')) chars = used_chars(html_tree.getroot()) - html_string = etree.tostring(html_tree, - pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='') + html_string = etree.tostring( + html_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + ) zip.writestr('OPS/part%d.html' % chunk_counter, html_string) add_to_manifest(manifest, chunk_counter) add_to_spine(spine, chunk_counter) @@ -486,7 +494,6 @@ def transform(wldoc, verbose=False, return toc, chunk_counter, chars, sample - document = deepcopy(wldoc) del wldoc @@ -517,13 +524,19 @@ def transform(wldoc, verbose=False, mime.compress_type = zipfile.ZIP_STORED mime.extra = '' zip.writestr(mime, 'application/epub+zip') - zip.writestr('META-INF/container.xml', '' \ - '' \ - '') - zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png')) - zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png')) + zip.writestr( + 'META-INF/container.xml', + '' + '' + '' + '' + ) + zip.write(get_resource('res/wl-logo-small.png'), + os.path.join('OPS', 'logo_wolnelektury.png')) + zip.write(get_resource('res/jedenprocent.png'), + os.path.join('OPS', 'jedenprocent.png')) if not style: style = get_resource('epub/style.css') zip.write(style, os.path.join('OPS', 'style.css')) @@ -542,9 +555,11 @@ def transform(wldoc, verbose=False, 
cover_tree = etree.parse(get_resource('epub/cover.html')) cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name) zip.writestr('OPS/cover.html', etree.tostring( - cover_tree, pretty_print = True, xml_declaration = True, encoding = "utf-8", - doctype='')) + cover_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + )) if bound_cover.uses_dc_cover: if document.book_info.cover_by: @@ -560,14 +575,16 @@ def transform(wldoc, verbose=False, opf.getroot()[0].append(etree.fromstring('')) guide.append(etree.fromstring('')) - annotations = etree.Element('annotations') - toc_file = etree.fromstring('' \ - '' \ - '') + toc_file = etree.fromstring( + '' + '' + '' + ) nav_map = toc_file[-1] if html_toc: @@ -593,11 +610,11 @@ def transform(wldoc, verbose=False, html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl')) chars = chars.union(used_chars(html_tree.getroot())) zip.writestr('OPS/annotations.html', etree.tostring( - html_tree, pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='')) + html_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + )) toc.add("Wesprzyj Wolne Lektury", "support.html") manifest.append(etree.fromstring( @@ -616,11 +633,11 @@ def transform(wldoc, verbose=False, html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl')) chars.update(used_chars(html_tree.getroot())) zip.writestr('OPS/last.html', etree.tostring( - html_tree, pretty_print = True, - xml_declaration = True, - encoding = "utf-8", - doctype='')) + html_tree, pretty_print=True, xml_declaration=True, + encoding="utf-8", + doctype='' + )) if not flags or not 'without-fonts' in flags: # strip fonts @@ -632,8 +649,10 @@ def transform(wldoc, verbose=False, os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')) for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf': - optimizer_call = ['perl', 
'subset.pl', '--chars', ''.join(chars).encode('utf-8'), - get_resource('fonts/' + fname), os.path.join(tmpdir, fname)] + optimizer_call = ['perl', 'subset.pl', '--chars', + ''.join(chars).encode('utf-8'), + get_resource('fonts/' + fname), + os.path.join(tmpdir, fname)] if verbose: print "Running font-optimizer" subprocess.check_call(optimizer_call) @@ -645,9 +664,8 @@ def transform(wldoc, verbose=False, rmtree(tmpdir) if cwd is not None: os.chdir(cwd) - zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print = True, - xml_declaration = True, - encoding = "utf-8")) + zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, + xml_declaration=True, encoding="utf-8")) title = document.book_info.title attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber" for st in attributes: @@ -664,9 +682,8 @@ def transform(wldoc, verbose=False, toc.add(u"Spis treści", "toc.html", index=1) zip.writestr('OPS/toc.html', toc.html().encode('utf-8')) toc.write_to_xml(nav_map) - zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print = True, - xml_declaration = True, - encoding = "utf-8")) + zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, + xml_declaration=True, encoding="utf-8")) zip.close() return OutputFile.from_filename(output_file.name) -- 2.20.1 From 6b77223ce55bfc487ca99b6c3fcb861f6923d74c Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 23 Apr 2014 12:26:20 +0200 Subject: [PATCH 04/16] Fix font-optimizer for Perl 5.18. 
--- librarian/font-optimizer/Font/Subsetter.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librarian/font-optimizer/Font/Subsetter.pm b/librarian/font-optimizer/Font/Subsetter.pm index cd1c40c..7aa60dc 100644 --- a/librarian/font-optimizer/Font/Subsetter.pm +++ b/librarian/font-optimizer/Font/Subsetter.pm @@ -1493,7 +1493,7 @@ sub subset { $self->{features} = $options->{features}; - my $uid = substr(sha1_hex("$filename $chars"), 0, 16); + my $uid = substr(sha1_hex(encode_utf8("$filename $chars")), 0, 16); if (not $self->{font}) { $self->preload($filename); -- 2.20.1 From 13b88a3bfdd4fa3cf5c92368aa5cb60fa832eda6 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 23 Apr 2014 13:32:24 +0200 Subject: [PATCH 05/16] Minor fixes. --- librarian/book2anything.py | 12 +----------- librarian/epub/style.css | 2 +- librarian/mobi.py | 9 +-------- scripts/book2epub | 5 +++++ scripts/book2mobi | 8 +++++++- 5 files changed, 15 insertions(+), 21 deletions(-) diff --git a/librarian/book2anything.py b/librarian/book2anything.py index 7ae6178..20cae8f 100755 --- a/librarian/book2anything.py +++ b/librarian/book2anything.py @@ -56,12 +56,6 @@ class Book2Anything(object): parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='print status messages to stdout') - parser.add_option('-t', '--html-toc', - action='store_true', dest='html_toc', default=False, - help='with inline html toc [book2epub only]') - parser.add_option('-k', '--use-kindlegen', - action='store_true', dest='use_kindlegen', default=False, - help='use kindlegen tool [book2mobi only]') parser.add_option('-d', '--make-dir', action='store_true', dest='make_dir', default=False, help='create a directory for author and put the output file in it') @@ -103,11 +97,7 @@ class Book2Anything(object): if transform_flags: transform_args['flags'] = transform_flags if options.verbose: - transform_args['verbose'] = True - if options.html_toc and cls.ext == 'epub': - 
transform_args['html_toc'] = True - if options.use_kindlegen and cls.ext == 'mobi': - transform_args['use_kindlegen'] = True + transform_args['verbose'] = True # Add cover support, if any. if cls.uses_cover: if options.image_cache: diff --git a/librarian/epub/style.css b/librarian/epub/style.css index 6225c7a..1f5d11b 100644 --- a/librarian/epub/style.css +++ b/librarian/epub/style.css @@ -135,7 +135,7 @@ p { line-height: 0; font-size: 0.7em; -} +} /* =================== */ /* = Custom elements = */ diff --git a/librarian/mobi.py b/librarian/mobi.py index b30c1fe..a0e463a 100644 --- a/librarian/mobi.py +++ b/librarian/mobi.py @@ -10,8 +10,6 @@ import subprocess from tempfile import NamedTemporaryFile from librarian import OutputFile -from librarian.cover import DefaultEbookCover -from librarian import get_resource def transform(wldoc, verbose=False, sample=None, cover=None, @@ -26,14 +24,9 @@ def transform(wldoc, verbose=False, sample=None, cover=None, document = deepcopy(wldoc) del wldoc - book_info = document.book_info - - if not flags: - flags = [] - flags = list(flags) epub = document.as_epub(verbose=verbose, sample=sample, - html_toc=True, cover=True, flags=flags) + html_toc=True, cover=cover or True, flags=flags) if verbose: kwargs = {} else: diff --git a/scripts/book2epub b/scripts/book2epub index 01ca79a..7a7a41d 100755 --- a/scripts/book2epub +++ b/scripts/book2epub @@ -17,6 +17,11 @@ class Book2Epub(Book2Anything): action='store_true', default=False, help='mark the output as a working copy') ] + transform_options = [ + Option('-t', '--html-toc', + action='store_true', dest='html_toc', default=False, + help='with inline html toc') + ] if __name__ == '__main__': diff --git a/scripts/book2mobi b/scripts/book2mobi index f477a83..b283309 100755 --- a/scripts/book2mobi +++ b/scripts/book2mobi @@ -4,7 +4,7 @@ # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# -from librarian.book2anything import Book2Anything +from librarian.book2anything import Book2Anything, Option class Book2Mobi(Book2Anything): @@ -14,6 +14,12 @@ class Book2Mobi(Book2Anything): cover_optional = False uses_provider = True + transform_options = [ + Option('-k', '--use-kindlegen', + action='store_true', dest='use_kindlegen', default=False, + help='use kindlegen tool instead of Calibre') + ] + if __name__ == '__main__': Book2Mobi.run() -- 2.20.1 From 2a4236f2ddbe95d07ba7fcbe750451340c38c2b9 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 30 Jun 2014 10:03:24 +0200 Subject: [PATCH 06/16] Added developmentStage meta field. --- librarian/dcparser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index 12bb24f..a33940d 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -245,6 +245,7 @@ class WorkInfo(object): Field( PLMETNS('digitisationSponsor'), 'sponsors', multiple=True, default=[]), Field( WLNS('digitisationSponsorNote'), 'sponsor_note', required=False), + Field( WLNS('developmentStage'), 'stage', required=False), ) @classmethod -- 2.20.1 From a91e41e489588ebad550cd9e22d157062effa2ff Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 30 Jun 2014 11:31:04 +0200 Subject: [PATCH 07/16] Allow no editors info in pdf, epub. 
--- librarian/epub.py | 6 ++++-- librarian/pdf.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/librarian/epub.py b/librarian/epub.py index 8ef436e..1ea2688 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -502,8 +502,10 @@ def transform(wldoc, verbose=False, document.edoc.getroot().set(flag, 'yes') # add editors info - document.edoc.getroot().set('editors', u', '.join(sorted( - editor.readable() for editor in document.editors()))) + editors = document.editors() + if editors: + document.edoc.getroot().set('editors', u', '.join(sorted( + editor.readable() for editor in editors))) if document.book_info.funders: document.edoc.getroot().set('funders', u', '.join( document.book_info.funders)) diff --git a/librarian/pdf.py b/librarian/pdf.py index 7aaff10..12c07ea 100644 --- a/librarian/pdf.py +++ b/librarian/pdf.py @@ -230,8 +230,10 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, root.set('customizations', u','.join(customizations)) # add editors info - root.set('editors', u', '.join(sorted( - editor.readable() for editor in document.editors()))) + editors = document.editors() + if editors: + root.set('editors', u', '.join(sorted( + editor.readable() for editor in editors))) if document.book_info.funders: root.set('funders', u', '.join(document.book_info.funders)) if document.book_info.thanks: -- 2.20.1 From 2c73c162844627d26991915fecc7e39f02bc34b8 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 6 Oct 2014 12:44:28 +0200 Subject: [PATCH 08/16] Change extract_annotations return value. 
--- librarian/html.py | 20 +++++-- librarian/xslt/book2html.xslt | 1 + setup.cfg | 9 ++++ tests/test_html_annotations.py | 97 ++++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 5 deletions(-) create mode 100644 setup.cfg create mode 100644 tests/test_html_annotations.py diff --git a/librarian/html.py b/librarian/html.py index 0eeb76b..e084ed2 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -4,6 +4,7 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # import os +import re import cStringIO import copy @@ -292,17 +293,26 @@ def add_table_of_themes(root): root.insert(0, themes_div) - def extract_annotations(html_path): """For each annotation, yields a tuple: anchor, text, html.""" parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') + re_qualifier = re.compile(ur'[^\u2014]+\s+\((.+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): - anchor = footnote.find('a[@name]').get('name') + fn_type = footnote.get('class').split('-')[1] + anchor = footnote.find('a[@class="annotation"]').get('href')[1:] del footnote[:2] - text_str = etree.tostring(footnote, method='text', encoding='utf-8').strip() - html_str = etree.tostring(footnote, method='html', encoding='utf-8') - yield anchor, text_str, html_str + footnote.text = None + if len(footnote) and footnote[-1].tail == '\n': + footnote[-1].tail = None + text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() + html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() + qualifier = None + match = re_qualifier.match(text_str) + if match: + qualifier = match.group(1) + + yield anchor, fn_type, qualifier, text_str, html_str diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt index 5e3228a..499a1dc 100644 --- a/librarian/xslt/book2html.xslt +++ b/librarian/xslt/book2html.xslt @@ -40,6 +40,7 @@

Przypisy

+ fn- [] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..54c0b24 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,9 @@ +[nosetests] +detailed-errors=1 +with-coverage=1 +cover-package=librarian +cover-erase=1 +with-doctest=1 +exclude= + formats + tests,test_html_annotations diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py new file mode 100644 index 0000000..87e9b01 --- /dev/null +++ b/tests/test_html_annotations.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 +from __future__ import unicode_literals + +from StringIO import StringIO +import tempfile +from librarian.parser import WLDocument +from librarian.html import extract_annotations +from lxml import etree +from nose.tools import eq_ + + +def _test_annotation(expected, got, name): + assert got[0].startswith('anchor-'), "%s: Unexpected anchor: '%s', should begin with 'anchor-'" % (name, got[0]) + eq_(expected[0], got[1], "%s: Unexpected type, expected '%s', got '%s'" % (name, expected[0], got[1])) + eq_(expected[1], got[2], "%s: Unexpected qualifier, expected '%s', got '%s'" % (name, expected[1], got[2])) + eq_(expected[2], got[3], "%s: Unexpected text representation, expected '%s', got '%s'" % (name, expected[2], got[3])) + exp_html = '
%s
' % (expected[0], expected[3]) + eq_(exp_html, got[4], "%s: Unexpected html representation, expected '%s', got '%s'" % (name, exp_html, got[4])) + + +def test_annotations(): + annotations = ( + + ('', ( + 'pe', + None, + '', + '

' + ), + 'Empty footnote'), + + ( + 'Definiendum --- definiens.', ( + 'pr', + None, + 'Definiendum \u2014 definiens.', + '

Definiendum \u2014 definiens.

' + ), + 'Plain footnote.'), + + ('Definiendum --- definiens.', ( + 'pt', + None, + 'Definiendum \u2014 definiens.', + '

Definiendum \u2014 definiens.

' + ), + 'Standard footnote.'), + + ('Definiendum (łac.) --- definiens.', ( + 'pr', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Plain footnote with qualifier'), + + ('Definiendum (łac.) --- definiens.', ( + 'pe', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Standard footnote with qualifier.'), + + (' Definiendum (daw.) --- definiens.', ( + 'pt', + 'daw.', + 'Definiendum (daw.) \u2014 definiens.', + '

Definiendum (daw.) \u2014 definiens.

' + ), + 'Standard footnote with leading whitespace and qualifier.'), + + ('Definiendum (łac.) --- definiens.', ( + 'pr', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Plain footnote with qualifier and some emphasis.'), + + ('Definiendum (łac.) --- definiens.', ( + 'pe', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', + '

Definiendum (łac.) \u2014 definiens.

' + ), + 'Standard footnote with qualifier and some emphasis.'), + + ) + + xml_src = ''' %s ''' % "".join( + t[0] for t in annotations) + html = WLDocument.from_string(xml_src, parse_dublincore=False).as_html().get_file() + res_annotations = list(extract_annotations(html)) + + for i, (src, expected, name) in enumerate(annotations): + yield _test_annotation, expected, res_annotations[i], name -- 2.20.1 From 14fbd48817ba27853a45164908d10ed679acac6e Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 6 Oct 2014 12:55:12 +0200 Subject: [PATCH 09/16] Disable test for unsupported behaviour in pictures. --- setup.cfg | 9 --------- tests/files/picture/angelus-novus.xml | 2 +- tests/test_picture.py | 3 ++- 3 files changed, 3 insertions(+), 11 deletions(-) delete mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 54c0b24..0000000 --- a/setup.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[nosetests] -detailed-errors=1 -with-coverage=1 -cover-package=librarian -cover-erase=1 -with-doctest=1 -exclude= - formats - tests,test_html_annotations diff --git a/tests/files/picture/angelus-novus.xml b/tests/files/picture/angelus-novus.xml index 964faed..85fa554 100644 --- a/tests/files/picture/angelus-novus.xml +++ b/tests/files/picture/angelus-novus.xml @@ -36,7 +36,7 @@
-
+ diff --git a/tests/test_picture.py b/tests/test_picture.py index f64f624..1169f44 100644 --- a/tests/test_picture.py +++ b/tests/test_picture.py @@ -42,7 +42,8 @@ def test_wlpicture(): def test_picture_parts(): wlp = picture.WLPicture.from_file(open(get_fixture('picture', 'angelus-novus.xml'))) parts = list(wlp.partiter()) - assert len(parts) == 5, "there should be %d parts of the picture" % 5 + expect_parts = 4 + assert len(parts) == expect_parts, "there should be %d parts of the picture" % expect_parts motifs = set() names = set() -- 2.20.1 From ac7899052e10143e0548ad7de2f67a2c6ca2b50b Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 9 Oct 2014 12:28:03 +0200 Subject: [PATCH 10/16] Fix in extract_annotations --- librarian/html.py | 2 +- tests/test_html_annotations.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/librarian/html.py b/librarian/html.py index e084ed2..85b9003 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -298,7 +298,7 @@ def extract_annotations(html_path): parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') - re_qualifier = re.compile(ur'[^\u2014]+\s+\((.+)\)\s+\u2014') + re_qualifier = re.compile(ur'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014') if footnotes is not None: for footnote in footnotes.findall('div'): fn_type = footnote.get('class').split('-')[1] diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index 87e9b01..851c5b0 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -86,6 +86,14 @@ def test_annotations(): ), 'Standard footnote with qualifier and some emphasis.'), + ('Definiendum (łac.) --- definens (some) --- more text.', ( + 'pe', + 'łac.', + 'Definiendum (łac.) \u2014 definiens (some) \u2014 more text.', + '

Definiendum (łac.) \u2014 definiens (some) \u2014 more text.

', + ), + 'Footnote with a second parentheses and mdash.'), + ) xml_src = ''' %s ''' % "".join( -- 2.20.1 From a3b6840527ec52ce8b6d74819633d8c85e3973ba Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Fri, 10 Oct 2014 15:08:02 +0200 Subject: [PATCH 11/16] Fix test. --- tests/test_html_annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index 851c5b0..f2b9eee 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -86,7 +86,7 @@ def test_annotations(): ), 'Standard footnote with qualifier and some emphasis.'), - ('Definiendum (łac.) --- definens (some) --- more text.', ( + ('Definiendum (łac.) --- definiens (some) --- more text.', ( 'pe', 'łac.', 'Definiendum (łac.) \u2014 definiens (some) \u2014 more text.', -- 2.20.1 From a04f11baee3eb7d090867c2d5639a120ec3217b8 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 22 Oct 2014 10:51:59 +0200 Subject: [PATCH 12/16] Data for #3396: picture style, full creation date. 
--- librarian/dcparser.py | 2 +- librarian/picture.py | 1 + tests/test_html_annotations.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index a33940d..f413fac 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -230,7 +230,7 @@ class WorkInfo(object): salias='funder', multiple=True, default=[]), Field( DCNS('contributor.thanks'), 'thanks', required=False), - Field( DCNS('date'), 'created_at', as_date), + Field( DCNS('date'), 'created_at'), Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), Field( DCNS('publisher'), 'publisher'), diff --git a/librarian/picture.py b/librarian/picture.py index 5d644d7..5a0c47b 100644 --- a/librarian/picture.py +++ b/librarian/picture.py @@ -32,6 +32,7 @@ class PictureInfo(WorkInfo): Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True), Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True), Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True, required=False), + Field(DCNS('subject.style'), 'styles', salias='style', multiple=True, required=False), Field(DCNS('format.dimensions'), 'dimensions', required=False), Field(DCNS('format.checksum.sha1'), 'sha1', required=True), diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index f2b9eee..f269042 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -80,8 +80,8 @@ def test_annotations(): ('Definiendum (łac.) --- definiens.', ( 'pe', - 'łac.', - 'Definiendum (łac.) \u2014 definiens.', + 'łac.', + 'Definiendum (łac.) \u2014 definiens.', '

Definiendum (łac.) \u2014 definiens.

' ), 'Standard footnote with qualifier and some emphasis.'), -- 2.20.1 From a3be479506edf42dc58feb22b26e4f5da1e49edd Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 23 Oct 2014 13:51:06 +0200 Subject: [PATCH 13/16] html.extract_annotations: Allow multiple footnote qualifiers. Use only ones accepted by editors. --- librarian/fn_qualifiers.py | 198 +++++++++++++++++++++ librarian/html.py | 26 ++- scripts/fn_qualifiers_list_from_redmine.py | 35 ++++ tests/test_html_annotations.py | 26 ++- 4 files changed, 272 insertions(+), 13 deletions(-) create mode 100644 librarian/fn_qualifiers.py create mode 100644 scripts/fn_qualifiers_list_from_redmine.py diff --git a/librarian/fn_qualifiers.py b/librarian/fn_qualifiers.py new file mode 100644 index 0000000..51168e4 --- /dev/null +++ b/librarian/fn_qualifiers.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 +""" +List of standard footnote qualifiers. +This file is generated by scripts/fn_qualifiers_list_from_wiki.py, +do not edit it. +""" +from __future__ import unicode_literals + + +FN_QUALIFIERS = { + 'a.': 'albo', + 'alb.': 'albański', + 'amer.': 'amerykański, amerykańskie', + 'anat.': 'anatomiczne', + 'ang.': 'angielski, angielskie', + 'antr.': 'antropologia, antropologiczny', + 'ar.': 'arabski', + 'archeol.': 'archeologia, archeologiczny', + 'archit.': 'architektura', + 'astr.': 'astronomia', + 'austr.': 'austriacki', + 'austral.': 'australijski', + 'B.': 'biernik', + 'białorus.': 'białoruski', + 'biol.': 'biologia, biologiczny', + 'blm': 'bez liczby mnogiej', + 'blp': 'bez liczby pojedynczej', + 'bot.': 'botanika', + 'bułg.': 'bułgarski', + 'C.': 'celownik', + 'celt.': 'celtycki', + 'chem.': 'chemiczny', + 'chiń.': 'chiński', + 'chrześc.': 'chrześcijański, chrześcijaństwo', + 'cz.': 'czas (gramatyczny)', + 'cz.przesz.': 'czas przeszły', + 'cz.przysz.': 'czas przyszły', + 'cz.ter.': 'czas teraźniejszy', + 'czas.': 'czasownik', + 'czes.': 'czeski', + 'D.': 'dopełniacz', + 'daw.': 'dawne', + 'dk': 'dokonane', + 'druk.': 
'drukarstwo, drukowany', + 'dziec.': 'dziecięcy', + 'egip.': 'egipski', + 'ekon.': 'ekonomiczny', + 'elektr.': 'elektryczny', + 'etn.': 'etnografia, etniczny', + 'euf.': 'eufemizm', + 'film.': 'filmowy', + 'filoz.': 'filozoficzny', + 'fiń.': 'fiński', + 'fiz.': 'fizyka', + 'fizjol.': 'fizjologia', + 'fot.': 'fotografia, fotograficzny', + 'fr.': 'francuski', + 'fraz.': 'frazeologia, frazeologiczny', + 'fragm.': 'fragment', + 'genet.': 'genetyka, genetyczny', + 'geogr.': 'geografia, geograficzny', + 'geol.': 'geologia', + 'geom.': 'geometria', + 'gr.': 'grecki', + 'gw.': 'gwara, gwarowe', + 'hand.': 'handel, handlowy', + 'hebr.': 'hebrajski', + 'hind.': 'hinduski', + 'hist.': 'historia, historyczny', + 'hiszp.': 'hiszpański', + 'hol.': 'holenderski', + 'im.': 'imienia', + 'imiesł.': 'imiesłów, imiesłowowy', + 'in.': 'inne, inny', + 'inf.': 'informacja', + 'inform.': 'informatyka', + 'irl.': 'irlandzki', + 'iron.': 'ironicznie', + 'isl.': 'islandzki', + 'itd.': 'i tak dalej', + 'itp.': 'i tym podobne', + 'jap.': 'japoński', + 'jęz.': 'język, językowy, językoznawstwo', + 'kg': 'kilogram', + 'km': 'kilometr', + 'lit.': 'literacki, literatura', + 'lm': 'liczba mnoga', + 'łac.': 'łacina, łacińskie', + 'M.': 'mianownik', + 'm.': 'męski', + 'mat.': 'matematyka', + 'med.': 'medyczne', + 'meteor.': 'meteorologia, meteorologiczny', + 'min.': 'minuta', + 'm.in.': 'między innymi', + 'miner.': 'mineralogia', + 'mit.': 'mitologia', + 'mit. germ.': 'mitologia germańska', + 'mit. gr.': 'mitologia grecka', + 'mit. 
rzym.': 'mitologia rzymska', + 'mors.': 'morskie', + 'm.-os.': 'męskoosobowy', + 'Ms.': 'miejscownik', + 'muz.': 'muzyczny', + 'N.': 'narzędnik', + 'n.': 'nijaki', + 'ndk': 'niedokonany', + 'ndm': 'nieodmienny', + 'n.e.': 'nasza era', + 'nieos.': 'nieosobowy', + 'niem.': 'niemiecki', + 'norw.': 'norweski', + 'np.': 'na przykład', + 'obelż.': 'obelżywie', + 'odm.': 'odmienny', + 'ok.': 'około', + 'os.': 'osoba, osobowy', + 'płd.': 'południowy', + 'płn.': 'północny', + 'p.n.e.': 'przed naszą erą', + 'pocz.': 'początek', + 'poet.': 'poetyckie', + 'pogard.': 'pogardliwe', + 'pol.': 'polski', + 'polit.': 'polityczny', + 'poł.': 'połowa', + 'popr.': 'poprawnie', + 'por.': 'porównaj', + 'port.': 'portugalski', + 'posp.': 'pospolity', + 'pot.': 'potocznie', + 'praw.': 'prawo, prawnicze', + 'przen.': 'przenośnie', + 'przestarz.': 'przestarzałe', + 'przesz.': 'przeszły', + 'przym.': 'przymiotnik', + 'przysł.': 'przysłowiowy', + 'przysłów.': 'przysłówek', + 'przysz.': 'przyszły', + 'psychol.': 'psychologia, psychologiczny', + 'r.': 'rok', + 'r.m.': 'rodzaj męski', + 'r.n.': 'rodzaj nijaki', + 'r.ż.': 'rodzaj żeński', + 'reg.': 'regionalne', + 'rel.': 'religijny, religioznawstwo', + 'rodz.': 'rodzaj', + 'roln.': 'rolnictwo, rolniczy', + 'ros.': 'rosyjski', + 'rub.': 'rubasznie', + 'rum.': 'rumuński', + 'rzad.': 'rzadki', + 'rzecz.': 'rzeczownik', + 'rzym.': 'rzymski', + 'skand.': 'skandynawski', + 'skrót.': 'skrótowiec', + 'słowac.': 'słowacki', + 'socjol.': 'socjologiczny', + 'sport.': 'sportowy', + 'st.': 'stopień', + 'starop.': 'staropolskie', + 'staroż.': 'starożytny', + 'szt.': 'sztuka', + 'szwedz.': 'szwedzki', + 'śr.': 'środek, środkowy', + 'środ.': 'środowiskowy', + 'teatr.': 'teatralny', + 'techn.': 'techniczny', + 'temp.': 'temperatura', + 'ter.': 'teraźniejszy', + 'tur.': 'turecki', + 'tur.-tat.': 'turecko-tatarski', + 'tys.': 'tysiąc', + 'tzn.': 'to znaczy', + 'uczn.': 'uczniowski', + 'ukr.': 'ukraiński', + 'urb.': 'urbanistyka', + 'W.': 'wołacz', + 'w.': 'wiek', + 
'węg.': 'węgierski', + 'wg': 'według', + 'wł.': 'włoski', + 'wojsk.': 'wojskowy', + 'wsch.': 'wschodni', + 'współ.': 'współcześnie', + 'wulg.': 'wulgarne', + 'wym.': 'wymawiaj', + 'zach.': 'zachodnie', + 'zdr.': 'zdrobnienie', + 'zgr.': 'zgrubienie', + 'zn.': 'znaczy, znaczenie', + 'zob.': 'zobacz', + 'zool.': 'zoologia', + 'zwł.': 'zwłaszcza', + 'ż.': 'żeński', + 'żart.': 'żartobliwie', + 'żegl.': 'żeglarskie', + } diff --git a/librarian/html.py b/librarian/html.py index 85b9003..6115b31 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -294,7 +294,14 @@ def add_table_of_themes(root): def extract_annotations(html_path): - """For each annotation, yields a tuple: anchor, text, html.""" + """Extracts annotations from HTML for annotations dictionary. + + For each annotation, yields a tuple of: + anchor, footnote type, valid qualifiers, text, html. + + """ + from .fn_qualifiers import FN_QUALIFIERS + parser = etree.HTMLParser(encoding='utf-8') tree = etree.parse(html_path, parser) footnotes = tree.find('//*[@id="footnotes"]') @@ -309,10 +316,21 @@ def extract_annotations(html_path): footnote[-1].tail = None text_str = etree.tostring(footnote, method='text', encoding=unicode).strip() html_str = etree.tostring(footnote, method='html', encoding=unicode).strip() - qualifier = None + match = re_qualifier.match(text_str) if match: - qualifier = match.group(1) + qualifier_str = match.group(1) + qualifiers = [] + for candidate in re.split('[;,]', qualifier_str): + candidate = candidate.strip() + if candidate in FN_QUALIFIERS: + qualifiers.append(candidate) + elif candidate.startswith('z '): + subcandidate = candidate.split()[1] + if subcandidate in FN_QUALIFIERS: + qualifiers.append(subcandidate) + else: + qualifiers = [] - yield anchor, fn_type, qualifier, text_str, html_str + yield anchor, fn_type, qualifiers, text_str, html_str diff --git a/scripts/fn_qualifiers_list_from_redmine.py b/scripts/fn_qualifiers_list_from_redmine.py new file mode 100644 index 
0000000..020b119 --- /dev/null +++ b/scripts/fn_qualifiers_list_from_redmine.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +""" +This script reads the table of footnote qualifiers from Redmine +and produces contents of fn_qualifiers.py – a list of valid qualifiers. +""" + +from lxml import etree +from urllib2 import urlopen + +url = 'http://redmine.nowoczesnapolska.org.pl/projects/wl-publikacje/wiki/Lista_skr%C3%B3t%C3%B3w' + +parser = etree.HTMLParser() +tree = etree.parse(urlopen(url), parser) + +print """\ +# -*- coding: utf-8 +\""" +List of standard footnote qualifiers. +This file is generated by scripts/fn_qualifiers_list_from_wiki.py, +do not edit it. +\""" +from __future__ import unicode_literals + + +FN_QUALIFIERS = {""".encode('utf-8') + +for td in tree.findall('//td'): + print (" '%s': '%s'," % ( + td[0].text.replace('\\', '\\\\').replace("'", "\\'"), + td[0].tail.strip(' -').replace('\\', '\\\\').replace("'", "\\'") + )).encode('utf-8') + +print """ }""".encode('utf-8') diff --git a/tests/test_html_annotations.py b/tests/test_html_annotations.py index f269042..4956b7d 100644 --- a/tests/test_html_annotations.py +++ b/tests/test_html_annotations.py @@ -23,7 +23,7 @@ def test_annotations(): ('', ( 'pe', - None, + [], '', '

' ), @@ -32,7 +32,7 @@ def test_annotations(): ( 'Definiendum --- definiens.', ( 'pr', - None, + [], 'Definiendum \u2014 definiens.', '

Definiendum \u2014 definiens.

' ), @@ -40,7 +40,7 @@ def test_annotations(): ('Definiendum --- definiens.', ( 'pt', - None, + [], 'Definiendum \u2014 definiens.', '

Definiendum \u2014 definiens.

' ), @@ -48,7 +48,7 @@ def test_annotations(): ('Definiendum (łac.) --- definiens.', ( 'pr', - 'łac.', + ['łac.'], 'Definiendum (łac.) \u2014 definiens.', '

Definiendum (łac.) \u2014 definiens.

' ), @@ -56,7 +56,7 @@ def test_annotations(): ('Definiendum (łac.) --- definiens.', ( 'pe', - 'łac.', + ['łac.'], 'Definiendum (łac.) \u2014 definiens.', '

Definiendum (łac.) \u2014 definiens.

' ), @@ -64,7 +64,7 @@ def test_annotations(): (' Definiendum (daw.) --- definiens.', ( 'pt', - 'daw.', + ['daw.'], 'Definiendum (daw.) \u2014 definiens.', '

Definiendum (daw.) \u2014 definiens.

' ), @@ -72,7 +72,7 @@ def test_annotations(): ('Definiendum (łac.) --- definiens.', ( 'pr', - 'łac.', + ['łac.'], 'Definiendum (łac.) \u2014 definiens.', '

Definiendum (łac.) \u2014 definiens.

' ), @@ -80,7 +80,7 @@ def test_annotations(): ('Definiendum (łac.) --- definiens.', ( 'pe', - 'łac.', + ['łac.'], 'Definiendum (łac.) \u2014 definiens.', '

Definiendum (łac.) \u2014 definiens.

' ), @@ -88,12 +88,20 @@ def test_annotations(): ('Definiendum (łac.) --- definiens (some) --- more text.', ( 'pe', - 'łac.', + ['łac.'], 'Definiendum (łac.) \u2014 definiens (some) \u2014 more text.', '

Definiendum (łac.) \u2014 definiens (some) \u2014 more text.

', ), 'Footnote with a second parentheses and mdash.'), + ('gemajna (daw., z niem. gemein: zwykły) --- częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.', ( + 'pe', + ['daw.', 'niem.'], + 'gemajna (daw., z niem. gemein: zwykły) \u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.', + '

gemajna (daw., z niem. gemein: zwykły) \u2014 częściej: gemajn, szeregowiec w wojsku polskim cudzoziemskiego autoramentu.

' + ), + 'Footnote with multiple and qualifiers and emphasis.'), + ) xml_src = ''' %s ''' % "".join( -- 2.20.1 From 5f92f5a341c58a30c0a5c0d64ba3ed498bb9db85 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Sun, 28 Dec 2014 00:35:25 +0100 Subject: [PATCH 14/16] Preliminary math and tables support. --- librarian/embeds/__init__.py | 56 ++++++++++++++++ librarian/embeds/latex.py | 21 ++++++ librarian/embeds/mathml.py | 10 +++ librarian/epub.py | 2 + librarian/epub/style.css | 7 ++ librarian/epub/xsltScheme.xsl | 27 +++++++- librarian/fb2/paragraphs.xslt | 10 +++ librarian/functions.py | 21 ++++++ librarian/pdf.py | 13 ++++ librarian/pdf/wl.cls | 5 ++ librarian/pdf/wl2tex.xslt | 53 +++++++++++++++ librarian/res/embeds/latex/template.tex | 9 +++ librarian/res/embeds/mathml/mathml2latex.xslt | 66 +++++++++++++++++++ librarian/xslt/book2html.xslt | 16 ++++- librarian/xslt/book2txt.xslt | 18 +++++ setup.py | 2 +- 16 files changed, 333 insertions(+), 3 deletions(-) create mode 100644 librarian/embeds/__init__.py create mode 100644 librarian/embeds/latex.py create mode 100644 librarian/embeds/mathml.py create mode 100644 librarian/res/embeds/latex/template.tex create mode 100644 librarian/res/embeds/mathml/mathml2latex.xslt diff --git a/librarian/embeds/__init__.py b/librarian/embeds/__init__.py new file mode 100644 index 0000000..3b1abdb --- /dev/null +++ b/librarian/embeds/__init__.py @@ -0,0 +1,56 @@ +import importlib +from lxml import etree + +known_types = { + 'application/mathml+xml': 'librarian.embeds.mathml.MathML', + 'application/x-latex': 'librarian.embeds.latex.LaTeX', +} + +class Embed(): + @classmethod + def transforms_to(cls, mime_types, downgrade=False): + matches = set() + for name, method in cls.__dict__.iteritems(): + if hasattr(method, "embed_converts_to"): + conv_type, conv_downgrade = method.embed_converts_to + if downgrade == conv_downgrade and conv_type in mime_types: + matches.add(conv_type) + return matches + + def transform_to(self, mime_type, 
downgrade=False): + for name, method in type(cls).__dict__.iteritems(): + if hasattr(method, "embed_converts_to"): + conv_type, conv_downgrade = method.embed_converts_to + if downgrade == conv_downgrade and conv_type == mime_type: + return method(self) + + +class DataEmbed(Embed): + def __init__(self, data=None): + self.data = data + +class TreeEmbed(Embed): + def __init__(self, tree=None): + if isinstance(tree, etree._Element): + tree = etree.ElementTree(tree) + self.tree = tree + +def converts_to(mime_type, downgrade=False): + def decorator(method): + method.embed_converts_to = mime_type, downgrade + return method + return decorator + +def downgrades_to(mime_type): + return converts_to(mime_type, True) + +def create_embed(mime_type, tree=None, data=None): + embed = known_types.get(mime_type) + if embed is None: + embed = DataEmbed if tree is None else TreeEmbed + else: + mod_name, cls_name = embed.rsplit('.', 1) + mod = importlib.import_module(mod_name) + embed = getattr(mod, cls_name) + + return embed(data if tree is None else tree) diff --git a/librarian/embeds/latex.py b/librarian/embeds/latex.py new file mode 100644 index 0000000..e10d165 --- /dev/null +++ b/librarian/embeds/latex.py @@ -0,0 +1,21 @@ +import os +import shutil +from subprocess import call, PIPE +from tempfile import mkdtemp +from librarian import get_resource +from . 
import DataEmbed, create_embed, downgrades_to, converts_to + +class LaTeX(DataEmbed): + @downgrades_to('image/png') + def to_png(self): + tmpl = open(get_resource('res/embeds/latex/template.tex')).read().decode('utf-8') + tempdir = mkdtemp('-librarian-embed-latex') + fpath = os.path.join(tempdir, 'doc.tex') + with open(fpath, 'w') as f: + f.write((tmpl % {'code': self.data}).encode('utf-8')) + call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE) + call(['convert', '-density', '150', os.path.join(tempdir, 'doc.pdf'), '-trim', + os.path.join(tempdir, 'doc.png')]) + pngdata = open(os.path.join(tempdir, 'doc.png')).read() + shutil.rmtree(tempdir) + return create_embed('image/png', data=pngdata) diff --git a/librarian/embeds/mathml.py b/librarian/embeds/mathml.py new file mode 100644 index 0000000..f99f979 --- /dev/null +++ b/librarian/embeds/mathml.py @@ -0,0 +1,10 @@ +from lxml import etree +from librarian import get_resource +from . import TreeEmbed, create_embed, downgrades_to, converts_to + +class MathML(TreeEmbed): + @downgrades_to('application/x-latex') + def to_latex(self): + xslt = etree.parse(get_resource('res/embeds/mathml/mathml2latex.xslt')) + output = self.tree.xslt(xslt) + return create_embed('application/x-latex', data=unicode(output)) diff --git a/librarian/epub.py b/librarian/epub.py index 1ea2688..bf58a9f 100644 --- a/librarian/epub.py +++ b/librarian/epub.py @@ -520,6 +520,8 @@ def transform(wldoc, verbose=False, output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False) zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) + functions.reg_mathml_epub(zip) + # write static elements mime = zipfile.ZipInfo() mime.filename = 'mimetype' diff --git a/librarian/epub/style.css b/librarian/epub/style.css index 1f5d11b..57f5490 100644 --- a/librarian/epub/style.css +++ b/librarian/epub/style.css @@ -368,3 +368,10 @@ p.minor-info { p.footer { margin-top: 2em; } + +table { + 
border-collapse: collapse; +} +td { + border: 1px solid black; +} diff --git a/librarian/epub/xsltScheme.xsl b/librarian/epub/xsltScheme.xsl index d2d7871..1c066d9 100644 --- a/librarian/epub/xsltScheme.xsl +++ b/librarian/epub/xsltScheme.xsl @@ -1,5 +1,5 @@ - + @@ -284,6 +284,31 @@ + + + + + + + + + + + +
+
+ + + + + + + + + + + + diff --git a/librarian/fb2/paragraphs.xslt b/librarian/fb2/paragraphs.xslt index 68c6257..334412e 100644 --- a/librarian/fb2/paragraphs.xslt +++ b/librarian/fb2/paragraphs.xslt @@ -39,6 +39,16 @@

————————

+ +
+
+ + + + + + + diff --git a/librarian/functions.py b/librarian/functions.py index b88a7fb..659bb94 100644 --- a/librarian/functions.py +++ b/librarian/functions.py @@ -121,3 +121,24 @@ def reg_lang_code_3to2(): _register_function(lang_code_3to2) +def mathml_latex(context, trees): + from librarian.embeds.mathml import MathML + text = MathML(trees[0]).to_latex().data + # Remove invisible multiplications, they produce unwanted spaces. + text = text.replace(u'\u2062', '') + return text + +def reg_mathml_latex(): + _register_function(mathml_latex) + +def reg_mathml_epub(zipf): + from librarian.embeds.mathml import MathML + def mathml(context, trees): + data = MathML(trees[0]).to_latex().to_png().data + name = "math%d.png" % mathml.count + mathml.count += 1 + zipf.writestr('OPS/' + name, data) + return name + mathml.count = 0 + _register_function(mathml) + diff --git a/librarian/pdf.py b/librarian/pdf.py index 12c07ea..95883e1 100644 --- a/librarian/pdf.py +++ b/librarian/pdf.py @@ -95,6 +95,17 @@ def fix_hanging(doc): exclude=[DCNS("identifier.url"), DCNS("rights.license")] ) +def fix_tables(doc): + for kol in doc.iter(tag='kol'): + if kol.tail is not None: + if not kol.tail.strip(): + kol.tail = None + for table in doc.iter(tag='tabela'): + if table.get('ramka') == '1' or table.get('ramki') == '1': + table.set('_format', '|' + 'X|' * len(table[0])) + else: + table.set('_format', 'X' * len(table[0])) + def move_motifs_inside(doc): """ moves motifs to be into block elements """ @@ -245,10 +256,12 @@ def transform(wldoc, verbose=False, save_tex=None, morefloats=None, parse_creator(document.edoc) substitute_hyphens(document.edoc) fix_hanging(document.edoc) + fix_tables(document.edoc) # wl -> TeXML style_filename = get_stylesheet("wl2tex") style = etree.parse(style_filename) + functions.reg_mathml_latex() # TeXML -> LaTeX temp = mkdtemp('-wl2pdf') diff --git a/librarian/pdf/wl.cls b/librarian/pdf/wl.cls index 8907b08..a802e20 100644 --- a/librarian/pdf/wl.cls +++ 
b/librarian/pdf/wl.cls @@ -73,6 +73,11 @@ \usepackage{xunicode} \usepackage{xltxtra} +\usepackage{longtable} +\usepackage{tabu} +\usepackage{unicode-math} +\setmathfont{Latin Modern Math} + \usepackage[overload]{textcase} \usepackage{scalefnt} \usepackage[colorlinks=true,linkcolor=black,setpagesize=false,urlcolor=black,xetex]{hyperref} diff --git a/librarian/pdf/wl2tex.xslt b/librarian/pdf/wl2tex.xslt index d39b61a..2548abc 100644 --- a/librarian/pdf/wl2tex.xslt +++ b/librarian/pdf/wl2tex.xslt @@ -435,6 +435,59 @@ + + + $ + + $ + + + + + + $$ + + $$ + + + + + + 1em + + to \textwidth + + + + + + + + + + + + + + 1em + + + + + + + + + + + + + + + + + + diff --git a/librarian/res/embeds/latex/template.tex b/librarian/res/embeds/latex/template.tex new file mode 100644 index 0000000..8e4b807 --- /dev/null +++ b/librarian/res/embeds/latex/template.tex @@ -0,0 +1,9 @@ +\documentclass{article} +\usepackage{unicode-math} +\setmathfont{Latin Modern Math} +\pagestyle{empty} +\begin{document} + +$%(code)s$ + +\end{document} diff --git a/librarian/res/embeds/mathml/mathml2latex.xslt b/librarian/res/embeds/mathml/mathml2latex.xslt new file mode 100644 index 0000000..76ccf95 --- /dev/null +++ b/librarian/res/embeds/mathml/mathml2latex.xslt @@ -0,0 +1,66 @@ + + + + + + \textrm{ + + } + + + + + + + + + + + + + + + + { + + }^{ + + } + + + + { + + }_{ + + } + + + + { + + } + + + + ( + + ) + + + + \frac{ + + }{ + + } + + + + \varepsilon + + + diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt index 499a1dc..201381c 100644 --- a/librarian/xslt/book2html.xslt +++ b/librarian/xslt/book2html.xslt @@ -231,6 +231,20 @@

+ +
+
+ + + + + + + + + + + @@ -244,7 +258,7 @@ - + diff --git a/librarian/xslt/book2txt.xslt b/librarian/xslt/book2txt.xslt index 317e581..a578492 100644 --- a/librarian/xslt/book2txt.xslt +++ b/librarian/xslt/book2txt.xslt @@ -233,6 +233,24 @@ + + + + + + + + + + + + + + + + + + diff --git a/setup.py b/setup.py index 732f145..10abe6e 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ setup( maintainer='Radek Czajka', maintainer_email='radoslaw.czajka@nowoczesnapolska.org.pl', url='http://github.com/fnp/librarian', - packages=['librarian'], + packages=['librarian', 'librarian.embeds'], package_data={'librarian': ['xslt/*.xslt', 'epub/*', 'mobi/*', 'pdf/*', 'fb2/*', 'fonts/*'] + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'res') + whole_tree(os.path.join(os.path.dirname(__file__), 'librarian'), 'font-optimizer')}, -- 2.20.1 From 141733d3db8c11f1eb69a9c0195f07c3c2ed3f8f Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 16 Apr 2015 13:20:40 +0200 Subject: [PATCH 15/16] Minor fixups. 
--- librarian/pdf/wl2tex.xslt | 2 +- librarian/res/embeds/mathml/mathml2latex.xslt | 20 +++++++++---------- librarian/xslt/book2html.xslt | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/librarian/pdf/wl2tex.xslt b/librarian/pdf/wl2tex.xslt index 2548abc..4d7ff03 100644 --- a/librarian/pdf/wl2tex.xslt +++ b/librarian/pdf/wl2tex.xslt @@ -36,7 +36,7 @@ - \usepackage[maxfloats=64]{morefloats} + \usepackage[maxfloats=53]{morefloats} diff --git a/librarian/res/embeds/mathml/mathml2latex.xslt b/librarian/res/embeds/mathml/mathml2latex.xslt index 76ccf95..92f60fc 100644 --- a/librarian/res/embeds/mathml/mathml2latex.xslt +++ b/librarian/res/embeds/mathml/mathml2latex.xslt @@ -5,25 +5,25 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> - + \textrm{ } - + - + - + - + { }^{ @@ -31,7 +31,7 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> } - + { }_{ @@ -39,19 +39,19 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> } - + { } - + ( ) - + \frac{ }{ @@ -59,7 +59,7 @@ xmlns:mml="http://www.w3.org/1998/Math/MathML"> } - + \varepsilon diff --git a/librarian/xslt/book2html.xslt b/librarian/xslt/book2html.xslt index 201381c..9a2b771 100644 --- a/librarian/xslt/book2html.xslt +++ b/librarian/xslt/book2html.xslt @@ -242,7 +242,7 @@ - + @@ -258,7 +258,7 @@ - + -- 2.20.1 From ff2a09e9ecd8e9bede2d3572942bcd32f66f6198 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 6 May 2015 14:03:20 +0200 Subject: [PATCH 16/16] FB2 footnotes fix. --- librarian/fb2/fb2.xslt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librarian/fb2/fb2.xslt b/librarian/fb2/fb2.xslt index 950b526..2f322e8 100644 --- a/librarian/fb2/fb2.xslt +++ b/librarian/fb2/fb2.xslt @@ -26,7 +26,7 @@ - + -- 2.20.1