3 This is a Pure Python module to hyphenate text.
5 It is inspired by Ruby's Text::Hyphen, but currently reads standard *.dic files,
6 that must be installed separately.
In the future it might be nice to distribute the dictionaries together with
this module, in a slightly prepared form, as Ruby's Text::Hyphen does.
11 Wilbert Berendsen, March 2008
12 info@wilbertberendsen.nl
# Public API of this module.  The trailing comma is required: without it the
# parentheses are only grouping and __all__ would be the plain string
# "Hyphenator", which makes "from <module> import *" try to import each
# character as a name and fail.
__all__ = ("Hyphenator",)
23 # cache of per-file Hyph_dict objects
# Patterns are compiled once at import time; only the bound methods are used.
#   parse_hex(repl, text) -- substitute every ^^hh escape (two lowercase hex
#                            digits) in text using repl.
#   parse(text)           -- list of (optional digit, optional non-digit)
#                            pairs covering text from left to right.
_hex_escape_re = re.compile(r'\^{2}([0-9a-f]{2})')
_digit_char_re = re.compile(r'(\d?)(\D?)')
parse_hex = _hex_escape_re.sub
parse = _digit_char_re.findall
def hexrepl(matchObj):
    """
    Return the character whose code point is the hex number in group 1.

    Intended as the replacement callback when substituting ^^hh escapes.

    matchObj: a regex match object whose first group holds hexadecimal digits.
    """
    code_point = int(matchObj.group(1), 16)
    try:
        # Python 2: unichr always yields a unicode character.
        return unichr(code_point)
    except NameError:
        # Python 3: unichr no longer exists; chr covers all code points.
        return chr(code_point)
34 class parse_alt(object):
36 Parse nonstandard hyphen pattern alternative.
37 The instance returns a special int with data about the current position
38 in the pattern when called with an odd value.
40 def __init__(self, pat, alt):
44 self.index = int(alt[1])
45 self.cut = int(alt[2]) + 1
48 self.cut = len(re.sub(r'[\d\.]', '', pat)) + 1
49 if pat.startswith('.'):
52 def __call__(self, val):
56 return dint(val, (self.change, self.index, self.cut))
63 Just an int some other data can be stuck to in a data attribute.
64 Call with ref=other to use the data from the other dint.
66 def __new__(cls, value, data=None, ref=None):
67 obj = int.__new__(cls, value)
68 if ref and type(ref) == dint:
75 class Hyph_dict(object):
77 Reads a hyph_*.dic file and stores the hyphenation patterns.
79 -filename : filename of hyph_*.dic to read
81 def __init__(self, filename):
84 charset = f.readline().strip()
85 if charset.startswith('charset '):
86 charset = charset[8:].strip()
89 pat = pat.decode(charset).strip()
90 if not pat or pat[0] == '%': continue
91 # replace ^^hh with the real character
92 pat = parse_hex(hexrepl, pat)
93 # read nonstandard hyphen alternatives
95 pat, alt = pat.split('/', 1)
96 factory = parse_alt(pat, alt)
99 tag, value = zip(*[(s, factory(i or "0")) for i, s in parse(pat)])
100 # if only zeros, skip this pattern
101 if max(value) == 0: continue
102 # chop zeros from beginning and end, and store start offset.
103 start, end = 0, len(value)
104 while not value[start]: start += 1
105 while not value[end-1]: end -= 1
106 self.patterns[''.join(tag)] = start, value[start:end]
109 self.maxlen = max(map(len, self.patterns.keys()))
111 def positions(self, word):
113 Returns a list of positions where the word can be hyphenated.
114 E.g. for the dutch word 'lettergrepen' this method returns
117 Each position is a 'data int' (dint) with a data attribute.
118 If the data attribute is not None, it contains a tuple with
119 information about nonstandard hyphenation at that point:
122 change: is a string like 'ff=f', that describes how hyphenation
124 index: where to substitute the change, counting from the current
126 cut: how many characters to remove while substituting the nonstandard
130 points = self.cache.get(word)
132 prepWord = '.%s.' % word
133 res = [0] * (len(prepWord) + 1)
134 for i in range(len(prepWord) - 1):
135 for j in range(i + 1, min(i + self.maxlen, len(prepWord)) + 1):
136 p = self.patterns.get(prepWord[i:j])
139 s = slice(i + offset, i + offset + len(value))
140 res[s] = map(max, value, res[s])
142 points = [dint(i - 1, ref=r) for i, r in enumerate(res) if r % 2]
143 self.cache[word] = points
147 class Hyphenator(object):
149 Reads a hyph_*.dic file and stores the hyphenation patterns.
150 Provides methods to hyphenate strings in various ways.
152 -filename : filename of hyph_*.dic to read
-left: make the first syllable not shorter than this
-right: make the last syllable not shorter than this
155 -cache: if true (default), use a cached copy of the dic file, if possible
157 left and right may also later be changed:
161 def __init__(self, filename, left=2, right=2, cache=True):
164 if not cache or filename not in hdcache:
165 hdcache[filename] = Hyph_dict(filename)
166 self.hd = hdcache[filename]
def positions(self, word):
    """
    Return the hyphenation positions of word that respect the margins.

    The candidate points come from self.hd.positions(word) (see also
    Hyph_dict.positions).  Points lying before self.left, or after
    len(word) - self.right, are left out.
    """
    lower = self.left
    upper = len(word) - self.right
    candidates = self.hd.positions(word)
    return [point for point in candidates if lower <= point <= upper]
177 def iterate(self, word):
179 Iterate over all hyphenation possibilities, the longest first.
181 if isinstance(word, str):
182 word = word.decode('latin1')
183 for p in reversed(self.positions(word)):
185 # get the nonstandard hyphenation data
186 change, index, cut = p.data
188 change = change.upper()
189 c1, c2 = change.split('=')
190 yield word[:p+index] + c1, c2 + word[p+index+cut:]
192 yield word[:p], word[p:]
194 def wrap(self, word, width, hyphen='-'):
196 Return the longest possible first part and the last part of the
197 hyphenated word. The first part has the hyphen already attached.
198 Returns None, if there is no hyphenation point before width, or
199 if the word could not be hyphenated.
202 for w1, w2 in self.iterate(word):
204 return w1 + hyphen, w2
206 def inserted(self, word, hyphen='-'):
208 Returns the word as a string with all the possible hyphens inserted.
209 E.g. for the dutch word 'lettergrepen' this method returns
210 the string 'let-ter-gre-pen'. The hyphen string to use can be
211 given as the second parameter, that defaults to '-'.
213 if isinstance(word, str):
214 word = word.decode('latin1')
216 for p in reversed(self.positions(word)):
218 # get the nonstandard hyphenation data
219 change, index, cut = p.data
221 change = change.upper()
222 l[p + index : p + index + cut] = change.replace('=', hyphen)
230 if __name__ == "__main__":
232 dict_file = sys.argv[1]
233 word = sys.argv[2].decode('latin1')
235 h = Hyphenator(dict_file, left=1, right=1)