X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java?ds=sidebyside

diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
new file mode 100644
index 0000000..32980cc
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
@@ -0,0 +1,171 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Light Stemmer for Czech.
+ * <p>
+ * Implements the algorithm described in:  
+ * <i>
+ * Indexing and stemming approaches for the Czech language
+ * </i>
+ * http://portal.acm.org/citation.cfm?id=1598600
+ * </p>
+ */
+public class CzechStemmer {
+  /**
+   * Stem an input buffer of Czech text in place: strip a case ending, then a
+   * possessive ending, then normalize the last letters of what remains.
+   *
+   * @param s input buffer, lowercase but with diacritical marks; may be modified
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   */
+  public int stem(char[] s, int len) {
+    len = removeCase(s, len);
+    len = removePossessives(s, len);
+    len = normalize(s, len);
+    return len;
+  }
+
+  /**
+   * Removes the first matching case ending, longest endings first; every rule
+   * leaves at least three characters. Non-ASCII letters are written as Unicode
+   * escapes so the literals do not depend on how the source file is decoded.
+   */
+  private int removeCase(char[] s, int len) {
+    if (len > 7 && endsWith(s, len, "atech"))
+      return len - 5;
+
+    if (len > 6 &&
+        (endsWith(s, len, "\u011btem") || // ětem
+        endsWith(s, len, "etem") ||
+        endsWith(s, len, "at\u016fm"))) // atům
+      return len - 4;
+
+    if (len > 5 &&
+        (endsWith(s, len, "ech") ||
+        endsWith(s, len, "ich") ||
+        endsWith(s, len, "\u00edch") || // ích
+        endsWith(s, len, "\u00e9ho") || // ého
+        endsWith(s, len, "\u011bmi") || // ěmi
+        endsWith(s, len, "emi") ||
+        endsWith(s, len, "\u00e9mu") || // ému
+        endsWith(s, len, "\u011bte") || // ěte
+        endsWith(s, len, "ete") ||
+        endsWith(s, len, "\u011bti") || // ěti
+        endsWith(s, len, "eti") ||
+        endsWith(s, len, "\u00edho") || // ího
+        endsWith(s, len, "iho") ||
+        endsWith(s, len, "\u00edmi") || // ími
+        endsWith(s, len, "\u00edmu") || // ímu
+        endsWith(s, len, "imu") ||
+        endsWith(s, len, "\u00e1ch") || // ách
+        endsWith(s, len, "ata") ||
+        endsWith(s, len, "aty") ||
+        endsWith(s, len, "\u00fdch") || // ých
+        endsWith(s, len, "ama") ||
+        endsWith(s, len, "ami") ||
+        endsWith(s, len, "ov\u00e9") || // ové
+        endsWith(s, len, "ovi") ||
+        endsWith(s, len, "\u00fdmi"))) // ými
+      return len - 3;
+
+    if (len > 4 &&
+        (endsWith(s, len, "em") ||
+        endsWith(s, len, "es") ||
+        endsWith(s, len, "\u00e9m") || // ém
+        endsWith(s, len, "\u00edm") || // ím
+        endsWith(s, len, "\u016fm") || // ům
+        endsWith(s, len, "at") ||
+        endsWith(s, len, "\u00e1m") || // ám
+        endsWith(s, len, "os") ||
+        endsWith(s, len, "us") ||
+        endsWith(s, len, "\u00fdm") || // ým
+        endsWith(s, len, "mi") ||
+        endsWith(s, len, "ou")))
+      return len - 2;
+
+    if (len > 3) {
+      switch (s[len - 1]) {
+        case 'a': case 'e': case 'i': case 'o': case 'u': case 'y':
+        case '\u016f': // ů
+        case '\u00e1': // á
+        case '\u00e9': // é
+        case '\u00ed': // í
+        case '\u00fd': // ý
+        case '\u011b': // ě
+          return len - 1;
+      }
+    }
+    return len;
+  }
+
+  /** Removes a trailing possessive suffix (ov, in, ův) when len is above 5. */
+  private int removePossessives(char[] s, int len) {
+    if (len > 5 &&
+        (endsWith(s, len, "ov") ||
+        endsWith(s, len, "in") ||
+        endsWith(s, len, "\u016fv"))) // ův
+      return len - 2;
+    return len;
+  }
+
+  /**
+   * Normalizes the end of the stem in place, applying only the first rule that
+   * matches. Empty input is returned as is: the rules read the last character.
+   */
+  private int normalize(char[] s, int len) {
+    if (len < 1) // s[len - 1] below would be out of bounds
+      return len;
+
+    if (endsWith(s, len, "\u010dt")) { // čt -> ck
+      s[len - 2] = 'c';
+      s[len - 1] = 'k';
+      return len;
+    }
+
+    if (endsWith(s, len, "\u0161t")) { // št -> sk
+      s[len - 2] = 's';
+      s[len - 1] = 'k';
+      return len;
+    }
+
+    switch (s[len - 1]) {
+      case 'c': case '\u010d': // [cč] -> k
+        s[len - 1] = 'k';
+        return len;
+      case 'z': case '\u017e': // [zž] -> h
+        s[len - 1] = 'h';
+        return len;
+    }
+
+    if (len > 1 && s[len - 2] == 'e') {
+      s[len - 2] = s[len - 1]; // e* > *
+      return len - 1;
+    }
+
+    if (len > 2 && s[len - 2] == '\u016f') {
+      s[len - 2] = 'o'; // *ů* -> *o*
+      return len;
+    }
+    return len;
+  }
+}