librarian/hyphenator.py

   1 """
   2
   3 This is a Pure Python module to hyphenate text.
   4
   5 It is inspired by Ruby's Text::Hyphen, but currently reads standard *.dic files,
   6 that must be installed separately.
   7
   8 In the future it's maybe nice if dictionaries could be distributed together with
   9 this module, in a slightly prepared form, like in Ruby's Text::Hyphen.
  10
  11 Wilbert Berendsen, March 2008
  12 info@wilbertberendsen.nl
  13
  14 License: LGPL.
  15
  16 """
  17
  18 import sys
  19 import re
  20
  21 __all__ = ("Hyphenator")
  22
  23 # cache of per-file Hyph_dict objects
  24 hdcache = {}
  25
  26 # precompile some stuff
  27 parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
  28 parse = re.compile(r'(\d?)(\D?)').findall
  29
  30 def hexrepl(matchObj):
  31     return unichr(int(matchObj.group(1), 16))
  32
  33
  34 class parse_alt(object):
  35     """
  36     Parse nonstandard hyphen pattern alternative.
  37     The instance returns a special int with data about the current position
  38     in the pattern when called with an odd value.
  39     """
  40     def __init__(self, pat, alt):
  41         alt = alt.split(',')
  42         self.change = alt[0]
  43         if len(alt) > 2:
  44             self.index = int(alt[1])
  45             self.cut = int(alt[2]) + 1
  46         else:
  47             self.index = 1
  48             self.cut = len(re.sub(r'[\d\.]', '', pat)) + 1
  49         if pat.startswith('.'):
  50             self.index += 1
  51
  52     def __call__(self, val):
  53         self.index -= 1
  54         val = int(val)
  55         if val & 1:
  56             return dint(val, (self.change, self.index, self.cut))
  57         else:
  58             return val
  59
  60
  61 class dint(int):
  62     """
  63     Just an int some other data can be stuck to in a data attribute.
  64     Call with ref=other to use the data from the other dint.
  65     """
  66     def __new__(cls, value, data=None, ref=None):
  67         obj = int.__new__(cls, value)
  68         if ref and type(ref) == dint:
  69             obj.data = ref.data
  70         else:
  71             obj.data = data
  72         return obj
  73
  74
  75 class Hyph_dict(object):
  76     """
  77     Reads a hyph_*.dic file and stores the hyphenation patterns.
  78     Parameters:
  79     -filename : filename of hyph_*.dic to read
  80     """
  81     def __init__(self, filename):
  82         self.patterns = {}
  83         f = open(filename)
  84         charset = f.readline().strip()
  85         if charset.startswith('charset '):
  86             charset = charset[8:].strip()
  87
  88         for pat in f:
  89             pat = pat.decode(charset).strip()
  90             if not pat or pat[0] == '%': continue
  91             # replace ^^hh with the real character
  92             pat = parse_hex(hexrepl, pat)
  93             # read nonstandard hyphen alternatives
  94             if '/' in pat:
  95                 pat, alt = pat.split('/', 1)
  96                 factory = parse_alt(pat, alt)
  97             else:
  98                 factory = int
  99             tag, value = zip(*[(s, factory(i or "0")) for i, s in parse(pat)])
 100             # if only zeros, skip this pattern
 101             if max(value) == 0: continue
 102             # chop zeros from beginning and end, and store start offset.
 103             start, end = 0, len(value)
 104             while not value[start]: start += 1
 105             while not value[end-1]: end -= 1
 106             self.patterns[''.join(tag)] = start, value[start:end]
 107         f.close()
 108         self.cache = {}
 109         self.maxlen = max(map(len, self.patterns.keys()))
 110
 111     def positions(self, word):
 112         """
 113         Returns a list of positions where the word can be hyphenated.
 114         E.g. for the dutch word 'lettergrepen' this method returns
 115         the list [3, 6, 9].
 116
 117         Each position is a 'data int' (dint) with a data attribute.
 118         If the data attribute is not None, it contains a tuple with
 119         information about nonstandard hyphenation at that point:
 120         (change, index, cut)
 121
 122         change: is a string like 'ff=f', that describes how hyphenation
 123             should take place.
 124         index: where to substitute the change, counting from the current
 125             point
 126         cut: how many characters to remove while substituting the nonstandard
 127             hyphenation
 128         """
 129         word = word.lower()
 130         points = self.cache.get(word)
 131         if points is None:
 132             prepWord = '.%s.' % word
 133             res = [0] * (len(prepWord) + 1)
 134             for i in range(len(prepWord) - 1):
 135                 for j in range(i + 1, min(i + self.maxlen, len(prepWord)) + 1):
 136                     p = self.patterns.get(prepWord[i:j])
 137                     if p:
 138                         offset, value = p
 139                         s = slice(i + offset, i + offset + len(value))
 140                         res[s] = map(max, value, res[s])
 141
 142             points = [dint(i - 1, ref=r) for i, r in enumerate(res) if r % 2]
 143             self.cache[word] = points
 144         return points
 145
 146
 147 class Hyphenator(object):
 148     """
 149     Reads a hyph_*.dic file and stores the hyphenation patterns.
 150     Provides methods to hyphenate strings in various ways.
 151     Parameters:
 152     -filename : filename of hyph_*.dic to read
 153     -left: make the first syllabe not shorter than this
 154     -right: make the last syllabe not shorter than this
 155     -cache: if true (default), use a cached copy of the dic file, if possible
 156
 157     left and right may also later be changed:
 158       h = Hyphenator(file)
 159       h.left = 1
 160     """
 161     def __init__(self, filename, left=2, right=2, cache=True):
 162         self.left  = left
 163         self.right = right
 164         if not cache or filename not in hdcache:
 165             hdcache[filename] = Hyph_dict(filename)
 166         self.hd = hdcache[filename]
 167
 168     def positions(self, word):
 169         """
 170         Returns a list of positions where the word can be hyphenated.
 171         See also Hyph_dict.positions. The points that are too far to
 172         the left or right are removed.
 173         """
 174         right = len(word) - self.right
 175         return [i for i in self.hd.positions(word) if self.left <= i <= right]
 176
 177     def iterate(self, word):
 178         """
 179         Iterate over all hyphenation possibilities, the longest first.
 180         """
 181         if isinstance(word, str):
 182             word = word.decode('latin1')
 183         for p in reversed(self.positions(word)):
 184             if p.data:
 185                 # get the nonstandard hyphenation data
 186                 change, index, cut = p.data
 187                 if word.isupper():
 188                     change = change.upper()
 189                 c1, c2 = change.split('=')
 190                 yield word[:p+index] + c1, c2 + word[p+index+cut:]
 191             else:
 192                 yield word[:p], word[p:]
 193
 194     def wrap(self, word, width, hyphen='-'):
 195         """
 196         Return the longest possible first part and the last part of the
 197         hyphenated word. The first part has the hyphen already attached.
 198         Returns None, if there is no hyphenation point before width, or
 199         if the word could not be hyphenated.
 200         """
 201         width -= len(hyphen)
 202         for w1, w2 in self.iterate(word):
 203             if len(w1) <= width:
 204                 return w1 + hyphen, w2
 205
 206     def inserted(self, word, hyphen='-'):
 207         """
 208         Returns the word as a string with all the possible hyphens inserted.
 209         E.g. for the dutch word 'lettergrepen' this method returns
 210         the string 'let-ter-gre-pen'. The hyphen string to use can be
 211         given as the second parameter, that defaults to '-'.
 212         """
 213         if isinstance(word, str):
 214             word = word.decode('latin1')
 215         l = list(word)
 216         for p in reversed(self.positions(word)):
 217             if p.data:
 218                 # get the nonstandard hyphenation data
 219                 change, index, cut = p.data
 220                 if word.isupper():
 221                     change = change.upper()
 222                 l[p + index : p + index + cut] = change.replace('=', hyphen)
 223             else:
 224                 l.insert(p, hyphen)
 225         return ''.join(l)
 226
 227     __call__ = iterate
 228
 229
 230 if __name__ == "__main__":
 231
 232     dict_file = sys.argv[1]
 233     word = sys.argv[2].decode('latin1')
 234
 235     h = Hyphenator(dict_file, left=1, right=1)
 236
 237     for i in h(word):
 238         print i
 239