librarian/functions.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from lxml import etree
   7 import re
   8
   9 from librarian.dcparser import Person
  10
  11 def _register_function(f):
  12     """ Register extension function with lxml """
  13     ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
  14     ns[f.__name__] = f
  15
  16
  17 def reg_substitute_entities():
  18     ENTITY_SUBSTITUTIONS = [
  19         (u'---', u'—'),
  20         (u'--', u'–'),
  21         (u'...', u'…'),
  22         (u',,', u'„'),
  23         (u'"', u'”'),
  24     ]
  25
  26     def substitute_entities(context, text):
  27         """XPath extension function converting all entites in passed text."""
  28         if isinstance(text, list):
  29             text = ''.join(text)
  30         for entity, substitutution in ENTITY_SUBSTITUTIONS:
  31             text = text.replace(entity, substitutution)
  32         return text
  33
  34     _register_function(substitute_entities)
  35
  36
  37 def reg_strip():
  38     def strip(context, text):
  39         """Remove unneeded whitespace from beginning and end"""
  40         if isinstance(text, list):
  41             text = ''.join(text)
  42         return re.sub(r'\s+', ' ', text).strip()
  43     _register_function(strip)
  44
  45
  46 def reg_starts_white():
  47     def starts_white(context, text):
  48         if isinstance(text, list):
  49             text = ''.join(text)
  50         if not text:
  51             return False
  52         return text[0].isspace()
  53     _register_function(starts_white)
  54
  55
  56 def reg_ends_white():
  57     def ends_white(context, text):
  58         if isinstance(text, list):
  59             text = ''.join(text)
  60         if not text:
  61             return False
  62         return text[-1].isspace()
  63     _register_function(ends_white)
  64
  65
  66 def reg_wrap_words():
  67     def wrap_words(context, text, wrapping):
  68         """XPath extension function automatically wrapping words in passed text"""
  69         if isinstance(text, list):
  70             text = ''.join(text)
  71         if not wrapping:
  72             return text
  73
  74         words = re.split(r'\s', text)
  75
  76         line_length = 0
  77         lines = [[]]
  78         for word in words:
  79             line_length += len(word) + 1
  80             if line_length > wrapping:
  81                 # Max line length was exceeded. We create new line
  82                 lines.append([])
  83                 line_length = len(word)
  84             lines[-1].append(word)
  85         return '\n'.join(' '.join(line) for line in lines)
  86     _register_function(wrap_words)
  87
  88
  89 def reg_person_name():
  90     def person_name(context, text):
  91         """ Converts "Name, Forename" to "Forename Name" """
  92         if isinstance(text, list):
  93             text = ''.join(text)
  94         p = Person.from_text(text)
  95         return ' '.join(p.first_names + (p.last_name,))
  96     _register_function(person_name)
  97
  98
  99 def reg_texcommand():
 100     def texcommand(context, text):
 101         """Remove non-letters"""
 102         if isinstance(text, list):
 103             text = ''.join(text)
 104         return re.sub(r'[^a-zA-Z]', '', text).strip()
 105     _register_function(texcommand)
 106
 107