X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java deleted file mode 100644 index c61a8d0..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.analysis.compound.hyphenation; - -import java.io.File; -import java.io.Serializable; -import java.net.MalformedURLException; -import java.util.ArrayList; -import java.util.HashMap; - -import org.xml.sax.InputSource; - -/** - * This tree structure stores the hyphenation patterns in an efficient way for - * fast lookup. It provides the provides the method to hyphenate a word. - * - * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. - */ -public class HyphenationTree extends TernaryTree implements PatternConsumer, - Serializable { - - private static final long serialVersionUID = -7842107987915665573L; - - /** - * value space: stores the interletter values - */ - protected ByteVector vspace; - - /** - * This map stores hyphenation exceptions - */ - protected HashMap> stoplist; - - /** - * This map stores the character classes - */ - protected TernaryTree classmap; - - /** - * Temporary map to store interletter values on pattern loading. - */ - private transient TernaryTree ivalues; - - public HyphenationTree() { - stoplist = new HashMap>(23); // usually a small table - classmap = new TernaryTree(); - vspace = new ByteVector(); - vspace.alloc(1); // this reserves index 0, which we don't use - } - - /** - * Packs the values by storing them in 4 bits, two values into a byte Values - * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the - * value. - * - * @param values a string of digits from '0' to '9' representing the - * interletter values. - * @return the index into the vspace array where the packed values are stored. - */ - protected int packValues(String values) { - int i, n = values.length(); - int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; - int offset = vspace.alloc(m); - byte[] va = vspace.getArray(); - for (i = 0; i < n; i++) { - int j = i >> 1; - byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f); - if ((i & 1) == 1) { - va[j + offset] = (byte) (va[j + offset] | v); - } else { - va[j + offset] = (byte) (v << 4); // big endian - } - } - va[m - 1 + offset] = 0; // terminator - return offset; - } - - protected String unpackValues(int k) { - StringBuilder buf = new StringBuilder(); - byte v = vspace.get(k++); - while (v != 0) { - char c = (char) ((v >>> 4) - 1 + '0'); - buf.append(c); - c = (char) (v & 0x0f); - if (c == 0) { - break; - } - c = (char) (c - 1 + '0'); - buf.append(c); - v = vspace.get(k++); - } - return buf.toString(); - } - - /** - * Read hyphenation patterns from an XML file. - * - * @param f the filename - * @throws HyphenationException In case the parsing fails - */ - public void loadPatterns(File f) throws HyphenationException { - try { - InputSource src = new InputSource(f.toURL().toExternalForm()); - loadPatterns(src); - } catch (MalformedURLException e) { - throw new HyphenationException("Error converting the File '" + f - + "' to a URL: " + e.getMessage()); - } - } - - /** - * Read hyphenation patterns from an XML file. - * - * @param source the InputSource for the file - * @throws HyphenationException In case the parsing fails - */ - public void loadPatterns(InputSource source) throws HyphenationException { - PatternParser pp = new PatternParser(this); - ivalues = new TernaryTree(); - - pp.parse(source); - - // patterns/values should be now in the tree - // let's optimize a bit - trimToSize(); - vspace.trimToSize(); - classmap.trimToSize(); - - // get rid of the auxiliary map - ivalues = null; - } - - public String findPattern(String pat) { - int k = super.find(pat); - if (k >= 0) { - return unpackValues(k); - } - return ""; - } - - /** - * String compare, returns 0 if equal or t is a substring of s - */ - protected int hstrcmp(char[] s, int si, char[] t, int ti) { - for (; s[si] == t[ti]; si++, ti++) { - if (s[si] == 0) { - return 0; - } - } - if (t[ti] == 0) { - return 0; - } - return s[si] - t[ti]; - } - - protected byte[] getValues(int k) { - StringBuilder buf = new StringBuilder(); - byte v = vspace.get(k++); - while (v != 0) { - char c = (char) ((v >>> 4) - 1); - buf.append(c); - c = (char) (v & 0x0f); - if (c == 0) { - break; - } - c = (char) (c - 1); - buf.append(c); - v = vspace.get(k++); - } - byte[] res = new byte[buf.length()]; - for (int i = 0; i < res.length; i++) { - res[i] = (byte) buf.charAt(i); - } - return res; - } - - /** - *

- * Search for all possible partial matches of word starting at index an update - * interletter values. In other words, it does something like: - *

- * - * for(i=0; i - *

- * But it is done in an efficient way since the patterns are stored in a - * ternary tree. In fact, this is the whole purpose of having the tree: doing - * this search without having to test every single pattern. The number of - * patterns for languages such as English range from 4000 to 10000. Thus, - * doing thousands of string comparisons for each word to hyphenate would be - * really slow without the tree. The tradeoff is memory, but using a ternary - * tree instead of a trie, almost halves the the memory used by Lout or TeX. - * It's also faster than using a hash table - *

- * - * @param word null terminated word to match - * @param index start index from word - * @param il interletter values array to update - */ - protected void searchPatterns(char[] word, int index, byte[] il) { - byte[] values; - int i = index; - char p, q; - char sp = word[i]; - p = root; - - while (p > 0 && p < sc.length) { - if (sc[p] == 0xFFFF) { - if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { - values = getValues(eq[p]); // data pointer is in eq[] - int j = index; - for (int k = 0; k < values.length; k++) { - if (j < il.length && values[k] > il[j]) { - il[j] = values[k]; - } - j++; - } - } - return; - } - int d = sp - sc[p]; - if (d == 0) { - if (sp == 0) { - break; - } - sp = word[++i]; - p = eq[p]; - q = p; - - // look for a pattern ending at this position by searching for - // the null char ( splitchar == 0 ) - while (q > 0 && q < sc.length) { - if (sc[q] == 0xFFFF) { // stop at compressed branch - break; - } - if (sc[q] == 0) { - values = getValues(eq[q]); - int j = index; - for (int k = 0; k < values.length; k++) { - if (j < il.length && values[k] > il[j]) { - il[j] = values[k]; - } - j++; - } - break; - } else { - q = lo[q]; - - /** - * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but - * java chars are unsigned - */ - } - } - } else { - p = d < 0 ? lo[p] : hi[p]; - } - } - } - - /** - * Hyphenate word and return a Hyphenation object. - * - * @param word the word to be hyphenated - * @param remainCharCount Minimum number of characters allowed before the - * hyphenation point. - * @param pushCharCount Minimum number of characters allowed after the - * hyphenation point. - * @return a {@link Hyphenation Hyphenation} object representing the - * hyphenated word or null if word is not hyphenated. - */ - public Hyphenation hyphenate(String word, int remainCharCount, - int pushCharCount) { - char[] w = word.toCharArray(); - return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); - } - - /** - * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n - * may be absent, the first n is at offset, the first l is at offset + - * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied - * into word. In the first part of the routine len = w.length, in the second - * part of the routine len = word.length. Three indices are used: index(w), - * the index in w, index(word), the index in word, letterindex(word), the - * index in the letter part of word. The following relations exist: index(w) = - * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) = - * index(word) - 1 (see first loop). It follows that: index(w) - index(word) = - * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + - * iIgnoreAtBeginning - */ - - /** - * Hyphenate word and return an array of hyphenation points. - * - * @param w char array that contains the word - * @param offset Offset to first character in word - * @param len Length of word - * @param remainCharCount Minimum number of characters allowed before the - * hyphenation point. - * @param pushCharCount Minimum number of characters allowed after the - * hyphenation point. - * @return a {@link Hyphenation Hyphenation} object representing the - * hyphenated word or null if word is not hyphenated. - */ - public Hyphenation hyphenate(char[] w, int offset, int len, - int remainCharCount, int pushCharCount) { - int i; - char[] word = new char[len + 3]; - - // normalize word - char[] c = new char[2]; - int iIgnoreAtBeginning = 0; - int iLength = len; - boolean bEndOfLetters = false; - for (i = 1; i <= len; i++) { - c[0] = w[offset + i - 1]; - int nc = classmap.find(c, 0); - if (nc < 0) { // found a non-letter character ... - if (i == (1 + iIgnoreAtBeginning)) { - // ... before any letter character - iIgnoreAtBeginning++; - } else { - // ... after a letter character - bEndOfLetters = true; - } - iLength--; - } else { - if (!bEndOfLetters) { - word[i - iIgnoreAtBeginning] = (char) nc; - } else { - return null; - } - } - } - len = iLength; - if (len < (remainCharCount + pushCharCount)) { - // word is too short to be hyphenated - return null; - } - int[] result = new int[len + 1]; - int k = 0; - - // check exception list first - String sw = new String(word, 1, len); - if (stoplist.containsKey(sw)) { - // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = - // null) - ArrayList hw = stoplist.get(sw); - int j = 0; - for (i = 0; i < hw.size(); i++) { - Object o = hw.get(i); - // j = index(sw) = letterindex(word)? - // result[k] = corresponding index(w) - if (o instanceof String) { - j += ((String) o).length(); - if (j >= remainCharCount && j < (len - pushCharCount)) { - result[k++] = j + iIgnoreAtBeginning; - } - } - } - } else { - // use algorithm to get hyphenation points - word[0] = '.'; // word start marker - word[len + 1] = '.'; // word end marker - word[len + 2] = 0; // null terminated - byte[] il = new byte[len + 3]; // initialized to zero - for (i = 0; i < len + 1; i++) { - searchPatterns(word, i, il); - } - - // hyphenation points are located where interletter value is odd - // i is letterindex(word), - // i + 1 is index(word), - // result[k] = corresponding index(w) - for (i = 0; i < len; i++) { - if (((il[i + 1] & 1) == 1) && i >= remainCharCount - && i <= (len - pushCharCount)) { - result[k++] = i + iIgnoreAtBeginning; - } - } - } - - if (k > 0) { - // trim result array - int[] res = new int[k+2]; - System.arraycopy(result, 0, res, 1, k); - // We add the synthetical hyphenation points - // at the beginning and end of the word - res[0]=0; - res[k+1]=len; - return new Hyphenation(res); - } else { - return null; - } - } - - /** - * Add a character class to the tree. It is used by - * {@link PatternParser PatternParser} as callback to add character classes. - * Character classes define the valid word characters for hyphenation. If a - * word contains a character not defined in any of the classes, it is not - * hyphenated. It also defines a way to normalize the characters in order to - * compare them with the stored patterns. Usually pattern files use only lower - * case characters, in this case a class for letter 'a', for example, should - * be defined as "aA", the first character being the normalization char. - */ - public void addClass(String chargroup) { - if (chargroup.length() > 0) { - char equivChar = chargroup.charAt(0); - char[] key = new char[2]; - key[1] = 0; - for (int i = 0; i < chargroup.length(); i++) { - key[0] = chargroup.charAt(i); - classmap.insert(key, 0, equivChar); - } - } - } - - /** - * Add an exception to the tree. It is used by - * {@link PatternParser PatternParser} class as callback to store the - * hyphenation exceptions. - * - * @param word normalized word - * @param hyphenatedword a vector of alternating strings and - * {@link Hyphen hyphen} objects. - */ - public void addException(String word, ArrayList hyphenatedword) { - stoplist.put(word, hyphenatedword); - } - - /** - * Add a pattern to the tree. Mainly, to be used by - * {@link PatternParser PatternParser} class as callback to add a pattern to - * the tree. - * - * @param pattern the hyphenation pattern - * @param ivalue interletter weight values indicating the desirability and - * priority of hyphenating at a given point within the pattern. It - * should contain only digit characters. (i.e. '0' to '9'). - */ - public void addPattern(String pattern, String ivalue) { - int k = ivalues.find(ivalue); - if (k <= 0) { - k = packValues(ivalue); - ivalues.insert(ivalue, (char) k); - } - insert(pattern, (char) k); - } - - @Override - public void printStats() { - System.out.println("Value space size = " - + Integer.toString(vspace.length())); - super.printStats(); - - } -}