1 package org.apache.lucene.analysis.cz;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
23 * Light Stemmer for Czech.
25 * Implements the algorithm described in:
27 * Indexing and stemming approaches for the Czech language
29 * http://portal.acm.org/citation.cfm?id=1598600
32 public class CzechStemmer {
35 * Stem an input buffer of Czech text.
37 * @param s input buffer
38 * @param len length of input buffer
39 * @return length of input buffer after normalization
41 * <p><b>NOTE</b>: Input is expected to be in lowercase,
42 * but with diacritical marks</p>
44 public int stem(char s[], int len) {
45 len = removeCase(s, len);
46 len = removePossessives(s, len);
47 len = normalize(s, len);
51 private int removeCase(char s[], int len) {
52 if (len > 7 && endsWith(s, len, "atech"))
56 (endsWith(s, len,"ětem") ||
57 endsWith(s, len,"etem") ||
58 endsWith(s, len,"atům")))
62 (endsWith(s, len, "ech") ||
63 endsWith(s, len, "ich") ||
64 endsWith(s, len, "ích") ||
65 endsWith(s, len, "ého") ||
66 endsWith(s, len, "ěmi") ||
67 endsWith(s, len, "emi") ||
68 endsWith(s, len, "ému") ||
69 endsWith(s, len, "ěte") ||
70 endsWith(s, len, "ete") ||
71 endsWith(s, len, "ěti") ||
72 endsWith(s, len, "eti") ||
73 endsWith(s, len, "ího") ||
74 endsWith(s, len, "iho") ||
75 endsWith(s, len, "ími") ||
76 endsWith(s, len, "ímu") ||
77 endsWith(s, len, "imu") ||
78 endsWith(s, len, "ách") ||
79 endsWith(s, len, "ata") ||
80 endsWith(s, len, "aty") ||
81 endsWith(s, len, "ých") ||
82 endsWith(s, len, "ama") ||
83 endsWith(s, len, "ami") ||
84 endsWith(s, len, "ové") ||
85 endsWith(s, len, "ovi") ||
86 endsWith(s, len, "ými")))
90 (endsWith(s, len, "em") ||
91 endsWith(s, len, "es") ||
92 endsWith(s, len, "ém") ||
93 endsWith(s, len, "ím") ||
94 endsWith(s, len, "ům") ||
95 endsWith(s, len, "at") ||
96 endsWith(s, len, "ám") ||
97 endsWith(s, len, "os") ||
98 endsWith(s, len, "us") ||
99 endsWith(s, len, "ým") ||
100 endsWith(s, len, "mi") ||
101 endsWith(s, len, "ou")))
105 switch (s[len - 1]) {
125 private int removePossessives(char s[], int len) {
127 (endsWith(s, len, "ov") ||
128 endsWith(s, len, "in") ||
129 endsWith(s, len, "ův")))
135 private int normalize(char s[], int len) {
136 if (endsWith(s, len, "čt")) { // čt -> ck
142 if (endsWith(s, len, "št")) { // št -> sk
149 case 'c': // [cč] -> k
153 case 'z': // [zž] -> h
159 if (len > 1 && s[len - 2] == 'e') {
160 s[len - 2] = s[len - 1]; // e* > *
164 if (len > 2 && s[len - 2] == 'ů') {
165 s[len - 2] = 'o'; // *ů* -> *o*