src/librarian/functions.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import unicode_literals
   7
   8 from lxml import etree
   9 import re
  10 from ebooklib import epub
  11
  12 from librarian.dcparser import Person
  13 from librarian import get_resource
  14
  15
  16 def _register_function(f):
  17     """ Register extension function with lxml """
  18     ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
  19     ns[f.__name__] = f
  20
  21
  22 def reg_substitute_entities():
  23     entity_substitutions = [
  24         (u'---', u'—'),
  25         (u'--', u'–'),
  26         (u'...', u'…'),
  27         (u',,', u'„'),
  28         (u'"', u'”'),
  29     ]
  30
  31     def substitute_entities(context, text):
  32         """XPath extension function converting all entites in passed text."""
  33         if isinstance(text, list):
  34             text = ''.join(text)
  35         for entity, substitutution in entity_substitutions:
  36             text = text.replace(entity, substitutution)
  37         return text
  38
  39     _register_function(substitute_entities)
  40
  41
  42 def reg_strip():
  43     def strip(context, text):
  44         """Remove unneeded whitespace from beginning and end"""
  45         if isinstance(text, list):
  46             text = ''.join(text)
  47         return re.sub(r'\s+', ' ', text).strip()
  48     _register_function(strip)
  49
  50
  51 def reg_starts_white():
  52     def starts_white(context, text):
  53         if isinstance(text, list):
  54             text = ''.join(text)
  55         if not text:
  56             return False
  57         return text[0].isspace()
  58     _register_function(starts_white)
  59
  60
  61 def reg_ends_white():
  62     def ends_white(context, text):
  63         if isinstance(text, list):
  64             text = ''.join(text)
  65         if not text:
  66             return False
  67         return text[-1].isspace()
  68     _register_function(ends_white)
  69
  70
  71 def reg_wrap_words():
  72     def wrap_words(context, text, wrapping):
  73         """
  74         XPath extension function automatically wrapping words
  75         in passed text.
  76         """
  77         if isinstance(text, list):
  78             text = ''.join(text)
  79         if not wrapping:
  80             return text
  81
  82         words = re.split(r'\s', text)
  83
  84         line_length = 0
  85         lines = [[]]
  86         for word in words:
  87             line_length += len(word) + 1
  88             if line_length > wrapping:
  89                 # Max line length was exceeded. We create new line
  90                 lines.append([])
  91                 line_length = len(word)
  92             lines[-1].append(word)
  93         return '\n'.join(' '.join(line) for line in lines)
  94     _register_function(wrap_words)
  95
  96
  97 def reg_person_name():
  98     def person_name(context, text):
  99         """ Converts "Name, Forename" to "Forename Name" """
 100         if isinstance(text, list):
 101             text = ''.join(text)
 102         return Person.from_text(text).readable()
 103     _register_function(person_name)
 104
 105
 106 def reg_texcommand():
 107     def texcommand(context, text):
 108         """Remove non-letters"""
 109         if isinstance(text, list):
 110             text = ''.join(text)
 111         return re.sub(r'[^a-zA-Z]', '', text).strip()
 112     _register_function(texcommand)
 113
 114
 115 def lang_code_3to2(text):
 116     """Convert 3-letter language code to 2-letter code"""
 117     result = ''
 118     text = ''.join(text)
 119     with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
 120         for line in f.read().decode('latin1').split('\n'):
 121             codes = line.strip().split('|')
 122             if codes[0] == text:
 123                 result = codes[2]
 124     if result == '':
 125         return text
 126     else:
 127         return result
 128
 129
 130 def mathml_latex(context, trees):
 131     from librarian.embeds.mathml import MathML
 132     text = MathML(trees[0]).to_latex().data
 133     # Remove invisible multiplications, they produce unwanted spaces.
 134     text = text.replace(u'\u2062', '')
 135     return text
 136
 137
def reg_mathml_latex():
    """Register ``mathml_latex`` via ``_register_function``."""
    _register_function(mathml_latex)
 140
 141
 142 def reg_mathml_epub(output):
 143     from librarian.embeds.mathml import MathML
 144
 145     def mathml(context, trees):
 146         data = MathML(trees[0]).to_latex().to_png().data
 147         name = "math%d.png" % mathml.count
 148         mathml.count += 1
 149         output.add_item(
 150             epub.EpubItem(
 151                 uid='math%d' % mathml.count,
 152                 file_name=name,
 153                 media_type='image/png',
 154                 content=data
 155             )
 156         )
 157
 158         return name
 159     mathml.count = 0
 160     _register_function(mathml)