X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java new file mode 100644 index 0000000..32980cc --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java @@ -0,0 +1,171 @@ +package org.apache.lucene.analysis.cz; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.analysis.util.StemmerUtil.*; + +/** + * Light Stemmer for Czech. + *

+ * Implements the algorithm described in: + * <i> + * Indexing and stemming approaches for the Czech language + * </i> + * http://portal.acm.org/citation.cfm?id=1598600 + *

+ */ +public class CzechStemmer { + + /** + * Stem an input buffer of Czech text. + * + * @param s input buffer + * @param len length of input buffer + * @return length of input buffer after normalization + * + *

NOTE: Input is expected to be in lowercase, + * but with diacritical marks

+ */ + public int stem(char s[], int len) { + len = removeCase(s, len); + len = removePossessives(s, len); + len = normalize(s, len); + return len; + } + + private int removeCase(char s[], int len) { + if (len > 7 && endsWith(s, len, "atech")) + return len - 5; + + if (len > 6 && + (endsWith(s, len,"ětem") || + endsWith(s, len,"etem") || + endsWith(s, len,"atům"))) + return len - 4; + + if (len > 5 && + (endsWith(s, len, "ech") || + endsWith(s, len, "ich") || + endsWith(s, len, "ích") || + endsWith(s, len, "ého") || + endsWith(s, len, "ěmi") || + endsWith(s, len, "emi") || + endsWith(s, len, "ému") || + endsWith(s, len, "ěte") || + endsWith(s, len, "ete") || + endsWith(s, len, "ěti") || + endsWith(s, len, "eti") || + endsWith(s, len, "ího") || + endsWith(s, len, "iho") || + endsWith(s, len, "ími") || + endsWith(s, len, "ímu") || + endsWith(s, len, "imu") || + endsWith(s, len, "ách") || + endsWith(s, len, "ata") || + endsWith(s, len, "aty") || + endsWith(s, len, "ých") || + endsWith(s, len, "ama") || + endsWith(s, len, "ami") || + endsWith(s, len, "ové") || + endsWith(s, len, "ovi") || + endsWith(s, len, "ými"))) + return len - 3; + + if (len > 4 && + (endsWith(s, len, "em") || + endsWith(s, len, "es") || + endsWith(s, len, "ém") || + endsWith(s, len, "ím") || + endsWith(s, len, "ům") || + endsWith(s, len, "at") || + endsWith(s, len, "ám") || + endsWith(s, len, "os") || + endsWith(s, len, "us") || + endsWith(s, len, "ým") || + endsWith(s, len, "mi") || + endsWith(s, len, "ou"))) + return len - 2; + + if (len > 3) { + switch (s[len - 1]) { + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + case 'ů': + case 'y': + case 'á': + case 'é': + case 'í': + case 'ý': + case 'ě': + return len - 1; + } + } + + return len; + } + + private int removePossessives(char s[], int len) { + if (len > 5 && + (endsWith(s, len, "ov") || + endsWith(s, len, "in") || + endsWith(s, len, "ův"))) + return len - 2; + + return len; + } + + private int normalize(char s[], int len) { + if 
(endsWith(s, len, "čt")) { // čt -> ck + s[len - 2] = 'c'; + s[len - 1] = 'k'; + return len; + } + + if (endsWith(s, len, "št")) { // št -> sk + s[len - 2] = 's'; + s[len - 1] = 'k'; + return len; + } + + switch(s[len - 1]) { + case 'c': // [cč] -> k + case 'č': + s[len - 1] = 'k'; + return len; + case 'z': // [zž] -> h + case 'ž': + s[len - 1] = 'h'; + return len; + } + + if (len > 1 && s[len - 2] == 'e') { + s[len - 2] = s[len - 1]; // e* > * + return len - 1; + } + + if (len > 2 && s[len - 2] == 'ů') { + s[len - 2] = 'o'; // *ů* -> *o* + return len; + } + + return len; + } +}