librarian/functions.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from lxml import etree
   7 import re
   8
   9 from librarian.dcparser import Person
  10 from librarian import get_resource
  11
  12
  13 def _register_function(f):
  14     """ Register extension function with lxml """
  15     ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
  16     ns[f.__name__] = f
  17
  18
  19 def reg_substitute_entities():
  20     entity_substitutions = [
  21         (u'---', u'—'),
  22         (u'--', u'–'),
  23         (u'...', u'…'),
  24         (u',,', u'„'),
  25         (u'"', u'”'),
  26     ]
  27
  28     def substitute_entities(context, text):
  29         """XPath extension function converting all entites in passed text."""
  30         if isinstance(text, list):
  31             text = ''.join(text)
  32         for entity, substitutution in entity_substitutions:
  33             text = text.replace(entity, substitutution)
  34         return text
  35
  36     _register_function(substitute_entities)
  37
  38
  39 def reg_strip():
  40     def strip(context, text):
  41         """Remove unneeded whitespace from beginning and end"""
  42         if isinstance(text, list):
  43             text = ''.join(text)
  44         return re.sub(r'\s+', ' ', text).strip()
  45     _register_function(strip)
  46
  47
  48 def reg_starts_white():
  49     def starts_white(context, text):
  50         if isinstance(text, list):
  51             text = ''.join(text)
  52         if not text:
  53             return False
  54         return text[0].isspace()
  55     _register_function(starts_white)
  56
  57
  58 def reg_ends_white():
  59     def ends_white(context, text):
  60         if isinstance(text, list):
  61             text = ''.join(text)
  62         if not text:
  63             return False
  64         return text[-1].isspace()
  65     _register_function(ends_white)
  66
  67
  68 def reg_wrap_words():
  69     def wrap_words(context, text, wrapping):
  70         """XPath extension function automatically wrapping words in passed text"""
  71         if isinstance(text, list):
  72             text = ''.join(text)
  73         if not wrapping:
  74             return text
  75
  76         words = re.split(r'\s', text)
  77
  78         line_length = 0
  79         lines = [[]]
  80         for word in words:
  81             line_length += len(word) + 1
  82             if line_length > wrapping:
  83                 # Max line length was exceeded. We create new line
  84                 lines.append([])
  85                 line_length = len(word)
  86             lines[-1].append(word)
  87         return '\n'.join(' '.join(line) for line in lines)
  88     _register_function(wrap_words)
  89
  90
  91 def reg_person_name():
  92     def person_name(context, text):
  93         """ Converts "Name, Forename" to "Forename Name" """
  94         if isinstance(text, list):
  95             text = ''.join(text)
  96         return Person.from_text(text).readable()
  97     _register_function(person_name)
  98
  99
 100 def reg_texcommand():
 101     def texcommand(context, text):
 102         """Remove non-letters"""
 103         if isinstance(text, list):
 104             text = ''.join(text)
 105         return re.sub(r'[^a-zA-Z]', '', text).strip()
 106     _register_function(texcommand)
 107
 108
 109 def reg_lang_code_3to2():
 110     def lang_code_3to2(context, text):
 111         """Convert 3-letter language code to 2-letter code"""
 112         result = ''
 113         text = ''.join(text)
 114         with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
 115             for line in f:
 116                 list = line.strip().split('|')
 117                 if list[0] == text:
 118                     result = list[2]
 119         if result == '':
 120             return text
 121         else:
 122             return result
 123     _register_function(lang_code_3to2)
 124
 125
 126 def mathml_latex(context, trees):
 127     from librarian.embeds.mathml import MathML
 128     text = MathML(trees[0]).to_latex().data
 129     # Remove invisible multiplications, they produce unwanted spaces.
 130     text = text.replace(u'\u2062', '')
 131     return text
 132
 133
def reg_mathml_latex():
    """Register ``mathml_latex`` as an lxml extension function."""
    _register_function(mathml_latex)
 136
 137
 138 def reg_mathml_epub(zipf):
 139     from librarian.embeds.mathml import MathML
 140
 141     def mathml(context, trees):
 142         data = MathML(trees[0]).to_latex().to_png().data
 143         name = "math%d.png" % mathml.count
 144         mathml.count += 1
 145         zipf.writestr('OPS/' + name, data)
 146         return name
 147     mathml.count = 0
 148     _register_function(mathml)