# Origin: librarian/hyphenator.py from https://git.mdrn.pl/librarian.git
"""

This is a Pure Python module to hyphenate text.

It is inspired by Ruby's Text::Hyphen, but currently reads standard *.dic files,
that must be installed separately.

In the future it's maybe nice if dictionaries could be distributed together with
this module, in a slightly prepared form, like in Ruby's Text::Hyphen.

Wilbert Berendsen, March 2008
info@wilbertberendsen.nl

License: LGPL.

"""

import re
import sys

# Must be a real tuple: a bare string here makes ``from hyphenator import *``
# look up each character ('H', 'y', ...) as a module attribute and fail.
__all__ = ("Hyphenator",)

# cache of per-file Hyph_dict objects, keyed by the filename passed in
hdcache = {}

# precompile some stuff
# parse_hex(repl, text): substitute every ``^^hh`` escape in text
parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
# parse(text): list of (digit-or-empty, nondigit-or-empty) pairs; the list
# always ends with one ('', '') pair for the empty match at end of string
parse = re.compile(r'(\d?)(\D?)').findall

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character


def hexrepl(matchObj):
    """Return the character whose code point is the matched two hex digits."""
    return _unichr(int(matchObj.group(1), 16))


def _as_text(word):
    """Return word as text, decoding byte strings as latin1."""
    # On Python 2 ``bytes`` is ``str``, so this keeps the original behaviour
    # there; on Python 3 text strings pass through untouched.
    if isinstance(word, bytes):
        return word.decode('latin1')
    return word


class parse_alt(object):
    """
    Parse nonstandard hyphen pattern alternative.
    The instance returns a special int with data about the current position
    in the pattern when called with an odd value.

    Parameters:
    -pat : the pattern text (the part before the '/')
    -alt : the alternative text (the part after the '/'), of the form
           'change' or 'change,index,cut'
    """
    def __init__(self, pat, alt):
        alt = alt.split(',')
        self.change = alt[0]
        if len(alt) > 2:
            self.index = int(alt[1])
            # NOTE(review): the cut read from the dictionary is incremented
            # by one here (and in the default below) -- confirm this matches
            # the dictionary format's definition of "cut".
            self.cut = int(alt[2]) + 1
        else:
            self.index = 1
            self.cut = len(re.sub(r'[\d\.]', '', pat)) + 1
        if pat.startswith('.'):
            self.index += 1

    def __call__(self, val):
        """
        Convert one pattern value. Each call moves the stored index one
        step back, so it must be called once per value, in pattern order.
        Odd values are returned as a dint carrying (change, index, cut);
        even values are returned as a plain int.
        """
        self.index -= 1
        val = int(val)
        if val & 1:
            return dint(val, (self.change, self.index, self.cut))
        return val


class dint(int):
    """
    Just an int some other data can be stuck to in a data attribute.
    Call with ref=other to use the data from the other dint.
    """
    def __new__(cls, value, data=None, ref=None):
        obj = int.__new__(cls, value)
        # isinstance (rather than a truthiness test plus an exact type
        # comparison) also honours a zero-valued dint passed as ref.
        if isinstance(ref, dint):
            obj.data = ref.data
        else:
            obj.data = data
        return obj


class Hyph_dict(object):
    """
    Reads a hyph_*.dic file and stores the hyphenation patterns.
    Parameters:
    -filename : filename of hyph_*.dic to read
    """
    def __init__(self, filename):
        self.patterns = {}
        # Read bytes and decode explicitly with the charset named on the
        # first line; the context manager closes the file even if a line
        # fails to decode or parse.
        with open(filename, 'rb') as f:
            charset = f.readline().decode('latin1').strip()
            if charset.startswith('charset '):
                charset = charset[8:].strip()

            for raw in f:
                pat = raw.decode(charset).strip()
                if not pat or pat[0] == '%':
                    continue
                # replace ^^hh with the real character
                pat = parse_hex(hexrepl, pat)
                # read nonstandard hyphen alternatives
                if '/' in pat:
                    pat, alt = pat.split('/', 1)
                    factory = parse_alt(pat, alt)
                else:
                    factory = int
                tag, value = zip(*[(s, factory(i or "0"))
                                   for i, s in parse(pat)])
                # if only zeros, skip this pattern
                if max(value) == 0:
                    continue
                # chop zeros from beginning and end, and store start offset.
                start, end = 0, len(value)
                while not value[start]:
                    start += 1
                while not value[end - 1]:
                    end -= 1
                self.patterns[''.join(tag)] = start, value[start:end]
        self.cache = {}
        # A dictionary without any usable pattern must not crash max().
        self.maxlen = max(map(len, self.patterns)) if self.patterns else 0

    def positions(self, word):
        """
        Returns a list of positions where the word can be hyphenated.
        E.g. for the dutch word 'lettergrepen' this method returns
        the list [3, 6, 9].

        Each position is a 'data int' (dint) with a data attribute.
        If the data attribute is not None, it contains a tuple with
        information about nonstandard hyphenation at that point:
        (change, index, cut)

        change: is a string like 'ff=f', that describes how hyphenation
            should take place.
        index: where to substitute the change, counting from the current
            point
        cut: how many characters to remove while substituting the nonstandard
            hyphenation

        The word is lowercased before lookup and results are cached per
        lowercased word; the cached list itself is returned.
        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            prepWord = '.%s.' % word
            res = [0] * (len(prepWord) + 1)
            for i in range(len(prepWord) - 1):
                for j in range(i + 1,
                               min(i + self.maxlen, len(prepWord)) + 1):
                    p = self.patterns.get(prepWord[i:j])
                    if p:
                        offset, value = p
                        s = slice(i + offset, i + offset + len(value))
                        # Keep the larger value per slot; on a tie the
                        # pattern's value (first argument) is kept.
                        res[s] = [max(v, r) for v, r in zip(value, res[s])]

            # Odd values mark hyphenation points; the -1 compensates for
            # the leading '.' added to the word.
            points = [dint(i - 1, ref=r) for i, r in enumerate(res) if r % 2]
            self.cache[word] = points
        return points


class Hyphenator(object):
    """
    Reads a hyph_*.dic file and stores the hyphenation patterns.
    Provides methods to hyphenate strings in various ways.
    Parameters:
    -filename : filename of hyph_*.dic to read
    -left: make the first syllable not shorter than this
    -right: make the last syllable not shorter than this
    -cache: if true (default), use a cached copy of the dic file, if possible

    left and right may also later be changed:
      h = Hyphenator(file)
      h.left = 1
    """
    def __init__(self, filename, left=2, right=2, cache=True):
        self.left = left
        self.right = right
        if not cache or filename not in hdcache:
            hdcache[filename] = Hyph_dict(filename)
        self.hd = hdcache[filename]

    def positions(self, word):
        """
        Returns a list of positions where the word can be hyphenated.
        See also Hyph_dict.positions. The points that are too far to
        the left or right are removed.
        """
        right = len(word) - self.right
        return [i for i in self.hd.positions(word) if self.left <= i <= right]

    def iterate(self, word):
        """
        Iterate over all hyphenation possibilities, the longest first.
        Yields (first part, last part) tuples. Byte strings are decoded
        as latin1 first.
        """
        word = _as_text(word)
        for p in reversed(self.positions(word)):
            if p.data:
                # get the nonstandard hyphenation data
                change, index, cut = p.data
                if word.isupper():
                    change = change.upper()
                c1, c2 = change.split('=')
                yield word[:p + index] + c1, c2 + word[p + index + cut:]
            else:
                yield word[:p], word[p:]

    def wrap(self, word, width, hyphen='-'):
        """
        Return the longest possible first part and the last part of the
        hyphenated word. The first part has the hyphen already attached.
        Returns None, if there is no hyphenation point before width, or
        if the word could not be hyphenated.
        """
        width -= len(hyphen)
        for w1, w2 in self.iterate(word):
            if len(w1) <= width:
                return w1 + hyphen, w2
        return None

    def inserted(self, word, hyphen='-'):
        """
        Returns the word as a string with all the possible hyphens inserted.
        E.g. for the dutch word 'lettergrepen' this method returns
        the string 'let-ter-gre-pen'. The hyphen string to use can be
        given as the second parameter, that defaults to '-'.
        Byte strings are decoded as latin1 first.
        """
        word = _as_text(word)
        chars = list(word)
        # Work from the right so earlier positions stay valid after edits.
        for p in reversed(self.positions(word)):
            if p.data:
                # get the nonstandard hyphenation data
                change, index, cut = p.data
                if word.isupper():
                    change = change.upper()
                chars[p + index:p + index + cut] = change.replace('=', hyphen)
            else:
                chars.insert(p, hyphen)
        return ''.join(chars)

    __call__ = iterate


def _main():
    """Print each split of the word in argv[2] using the dic file in argv[1]."""
    dict_file = sys.argv[1]
    word = _as_text(sys.argv[2])

    h = Hyphenator(dict_file, left=1, right=1)

    for i in h(word):
        print(i)


if __name__ == "__main__":
    _main()