X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java new file mode 100644 index 0000000..53cef6a --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekStemmer.java @@ -0,0 +1,819 @@ +package org.apache.lucene.analysis.el; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.util.Version; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A stemmer for Greek words, according to: Development of a Stemmer for the + * Greek Language. Georgios Ntais + *
+ * NOTE: Input is expected to be casefolded for Greek (including folding of final + * sigma to sigma), and with diacritics removed. This can be achieved with + * either {@link GreekLowerCaseFilter} or ICUFoldingFilter. + * @lucene.experimental + */ +public class GreekStemmer { + public int stem(char s[], int len) { + if (len < 4) // too short + return len; + + final int origLen = len; + // "short rules": if it hits one of these, it skips the "long list" + len = rule0(s, len); + len = rule1(s, len); + len = rule2(s, len); + len = rule3(s, len); + len = rule4(s, len); + len = rule5(s, len); + len = rule6(s, len); + len = rule7(s, len); + len = rule8(s, len); + len = rule9(s, len); + len = rule10(s, len); + len = rule11(s, len); + len = rule12(s, len); + len = rule13(s, len); + len = rule14(s, len); + len = rule15(s, len); + len = rule16(s, len); + len = rule17(s, len); + len = rule18(s, len); + len = rule19(s, len); + len = rule20(s, len); + // "long list" + if (len == origLen) + len = rule21(s, len); + + return rule22(s, len); + } + + private int rule0(char s[], int len) { + if (len > 9 && (endsWith(s, len, "καθεÏÏÏÏοÏ") + || endsWith(s, len, "καθεÏÏÏÏÏν"))) + return len - 4; + + if (len > 8 && (endsWith(s, len, "γεγονοÏοÏ") + || endsWith(s, len, "γεγονοÏÏν"))) + return len - 4; + + if (len > 8 && endsWith(s, len, "καθεÏÏÏÏα")) + return len - 3; + + if (len > 7 && (endsWith(s, len, "ÏαÏÎ¿Î³Î¹Î¿Ï ") + || endsWith(s, len, "ÏαÏογιÏν"))) + return len - 4; + + if (len > 7 && endsWith(s, len, "γεγονοÏα")) + return len - 3; + + if (len > 7 && endsWith(s, len, "καθεÏÏÏÏ")) + return len - 2; + + if (len > 6 && (endsWith(s, len, "ÏÎºÎ±Î³Î¹Î¿Ï ")) + || endsWith(s, len, "ÏκαγιÏν") + || endsWith(s, len, "Î¿Î»Î¿Î³Î¹Î¿Ï ") + || endsWith(s, len, "ολογιÏν") + || endsWith(s, len, "κÏεαÏοÏ") + || endsWith(s, len, "κÏεαÏÏν") + || endsWith(s, len, "ÏεÏαÏοÏ") + || endsWith(s, len, "ÏεÏαÏÏν") + || endsWith(s, len, "ÏεÏαÏοÏ") + || endsWith(s, len, "ÏεÏαÏÏν")) + return len - 4; + + if (len > 6 && endsWith(s, len, "ÏαÏογια")) + return len - 3; + + if (len > 6 && endsWith(s, len, "γεγονοÏ")) + return len - 2; + + if (len > 5 && (endsWith(s, len, "ÏÎ±Î³Î¹Î¿Ï ") + || endsWith(s, len, "ÏαγιÏν") + || endsWith(s, len, "ÏÎ¿Î³Î¹Î¿Ï ") + || endsWith(s, len, "ÏογιÏν"))) + return len - 4; + + if (len > 5 && (endsWith(s, len, "Ïκαγια") + || endsWith(s, len, "ολογια") + || endsWith(s, len, "κÏεαÏα") + || endsWith(s, len, "ÏεÏαÏα") + || endsWith(s, len, "ÏεÏαÏα"))) + return len - 3; + + if (len > 4 && (endsWith(s, len, "Ïαγια") + || endsWith(s, len, "Ïογια") + || endsWith(s, len, "ÏÏÏοÏ") + || endsWith(s, len, "ÏÏÏÏν"))) + return len - 3; + + if (len > 4 && (endsWith(s, len, "κÏεαÏ") + || endsWith(s, len, "ÏεÏαÏ") + || endsWith(s, len, "ÏεÏαÏ"))) + return len - 2; + + if (len > 3 && endsWith(s, len, "ÏÏÏα")) + return len - 2; + + if (len > 2 && endsWith(s, len, "ÏÏÏ")) + return len - 1; + + return len; + } + + private int rule1(char s[], int len) { + if (len > 4 && (endsWith(s, len, "αδεÏ") || endsWith(s, len, "αδÏν"))) { + len -= 4; + if (!(endsWith(s, len, "οκ") || + endsWith(s, len, "μαμ") || + endsWith(s, len, "μαν") || + endsWith(s, len, "μÏαμÏ") || + endsWith(s, len, "ÏαÏεÏ") || + endsWith(s, len, "γιαγι") || + endsWith(s, len, "νÏανÏ") || + endsWith(s, len, "ÎºÏ Ï") || + endsWith(s, len, "θει") || + endsWith(s, len, "ÏεθεÏ"))) + len += 2; // add back -αδ + } + return len; + } + + private int rule2(char s[], int len) { + if (len > 4 && (endsWith(s, len, "εδεÏ") || endsWith(s, len, "εδÏν"))) { + len -= 4; + if (endsWith(s, len, "οÏ") || + endsWith(s, len, "ιÏ") || + endsWith(s, len, "εμÏ") || + endsWith(s, len, "Ï Ï") || + endsWith(s, len, "γηÏ") || + endsWith(s, len, "δαÏ") || + endsWith(s, len, "κÏαÏÏ") || + endsWith(s, len, "μιλ")) + len += 2; // add back -εδ + } + return len; + } + + private int rule3(char s[], int len) { + if (len > 5 && (endsWith(s, len, "Î¿Ï Î´ÎµÏ") || endsWith(s, len, "Î¿Ï Î´Ïν"))) { + len -= 5; + if (endsWith(s, len, "αÏκ") || + endsWith(s, len, "καλιακ") || + endsWith(s, len, "ÏεÏαλ") || + endsWith(s, len, "λιÏ") || + endsWith(s, len, "Ïλεξ") || + endsWith(s, len, "Ïκ") || + endsWith(s, len, "Ï") || + endsWith(s, len, "Ïλ") || + endsWith(s, len, "ÏÏ") || + endsWith(s, len, "βελ") || + endsWith(s, len, "Î»Î¿Ï Î»") || + endsWith(s, len, "Ïν") || + endsWith(s, len, "ÏÏ") || + endsWith(s, len, "ÏÏαγ") || + endsWith(s, len, "Ïε")) + len += 3; // add back -Î¿Ï Î´ + } + return len; + } + + private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "Ï", "ιδ", "ÏαÏ"), + false); + + private int rule4(char s[], int len) { + if (len > 3 && (endsWith(s, len, "εÏÏ") || endsWith(s, len, "εÏν"))) { + len -= 3; + if (exc4.contains(s, 0, len)) + len++; // add back -ε + } + return len; + } + + private int rule5(char s[], int len) { + if (len > 2 && endsWith(s, len, "ια")) { + len -= 2; + if (endsWithVowel(s, len)) + len++; // add back -ι + } else if (len > 3 && (endsWith(s, len, "Î¹Î¿Ï ") || endsWith(s, len, "ιÏν"))) { + len -= 3; + if (endsWithVowel(s, len)) + len++; // add back -ι + } + return len; + } + + private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοÏαλ", "ηθ", "ανηθ", + "ανÏιδ", "ÏÏ Ï", "βÏÏμ", "γεÏ", "εξÏδ", "καλÏ", "καλλιν", "καÏαδ", + "Î¼Î¿Ï Î»", "μÏαν", "μÏαγιαÏ", "μÏολ", "μÏοÏ", "νιÏ", "ξικ", "ÏÏ Î½Î¿Î¼Î·Î»", + "ÏεÏÏ", "ÏιÏÏ", "ÏικανÏ", "ÏλιαÏÏ", "ÏοÏÏελν", "ÏÏÏÏοδ", "ÏεÏÏ", + "ÏÏ Î½Î±Î´", "ÏÏαμ", "Ï Ïοδ", "Ïιλον", "ÏÏ Î»Î¿Î´", "ÏαÏ"), + false); + + private int rule6(char s[], int len) { + boolean removed = false; + if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) { + len -= 3; + removed = true; + } else if (len > 4 && (endsWith(s, len, "Î¹ÎºÎ¿Ï ") || endsWith(s, len, "ικÏν"))) { + len -= 4; + removed = true; + } + + if (removed) { + if (endsWithVowel(s, len) || exc6.contains(s, 0, len)) + len += 2; // add back -ικ + } + return len; + } + + private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αναÏ", "αÏοθ", "αÏοκ", "αÏοÏÏ", "Î²Î¿Ï Î²", "ξεθ", "Î¿Ï Î»", + "Ïεθ", "ÏικÏ", "ÏοÏ", "ÏιÏ", "Ï"), + false); + + private int rule7(char s[], int len) { + if (len == 5 && endsWith(s, len, "αγαμε")) + return len - 1; + + if (len > 7 && endsWith(s, len, "ηθηκαμε")) + len -= 7; + else if (len > 6 && endsWith(s, len, "Î¿Ï Ïαμε")) + len -= 6; + else if (len > 5 && (endsWith(s, len, "αγαμε") || + endsWith(s, len, "ηÏαμε") || + endsWith(s, len, "ηκαμε"))) + len -= 5; + + if (len > 3 && endsWith(s, len, "αμε")) { + len -= 3; + if (exc7.contains(s, 0, len)) + len += 2; // add back -αμ + } + + return len; + } + + private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ÏÏ", "ÏÏ"), + false); + + private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31, + Arrays.asList("βεÏεÏ", "Î²Î¿Ï Î»Îº", "βÏαÏμ", "γ", "δÏÎ±Î´Î¿Ï Î¼", "θ", "καλÏÎ¿Ï Î¶", + "καÏÏελ", "κοÏμοÏ", "λαοÏλ", "μÏαμεθ", "μ", "Î¼Î¿Ï ÏÎ¿Ï Î»Î¼", "ν", "Î¿Ï Î»", + "Ï", "Ïελεκ", "Ïλ", "ÏολιÏ", "ÏοÏÏολ", "ÏαÏακαÏÏ", "ÏÎ¿Ï Î»Ï", + "ÏÏαÏλαÏ", "οÏÏ", "ÏÏιγγ", "ÏÏοÏ", "ÏÏÏοÏÏεÏ", "Ï", "ÏÏ ÏοÏλ", "αγ", + "οÏÏ", "γαλ", "γεÏ", "δεκ", "διÏλ", "αμεÏικαν", "Î¿Ï Ï", "Ïιθ", + "ÏÎ¿Ï ÏιÏ", "Ï", "ζÏνÏ", "ικ", "καÏÏ", "κοÏ", "λιÏ", "Î»Î¿Ï Î¸Î·Ï", "μαινÏ", + "μελ", "Ïιγ", "ÏÏ", "ÏÏεγ", "ÏÏαγ", "ÏÏαγ", "Ï", "εÏ", "αδαÏ", + "αθιγγ", "αμηÏ", "ανικ", "ανοÏγ", "αÏηγ", "αÏιθ", "αÏÏιγγ", "βαÏ", + "βαÏκ", "Î²Î±Î¸Ï Î³Î±Î»", "βιομηÏ", "βÏαÏÏ Îº", "διαÏ", "διαÏ", "ενοÏγ", + "Î¸Ï Ï", "καÏνοβιομηÏ", "καÏαγαλ", "κλιβ", "κοιλαÏÏ", "λιβ", + "μεγλοβιομηÏ", "μικÏοβιομηÏ", "νÏαβ", "ξηÏοκλιβ", "ολιγοδαμ", + "ολογαλ", "ÏενÏαÏÏ", "ÏεÏηÏ", "ÏεÏιÏÏ", "ÏλαÏ", "ÏÎ¿Î»Ï Î´Î±Ï", "ÏÎ¿Î»Ï Î¼Î·Ï", + "ÏÏεÏ", "Ïαβ", "ÏεÏ", "Ï ÏεÏηÏ", "Ï ÏοκοÏ", "ÏαμηλοδαÏ", "ÏηλοÏαβ"), + false); + + private int rule8(char s[], int len) { + boolean removed = false; + + if (len > 8 && endsWith(s, len, "Î¹Î¿Ï Î½Ïανε")) { + len -= 8; + removed = true; + } else if (len > 7 && endsWith(s, len, "ιονÏανε") || + endsWith(s, len, "Î¿Ï Î½Ïανε") || + endsWith(s, len, "ηθηκανε")) { + len -= 7; + removed = true; + } else if (len > 6 && endsWith(s, len, "ιοÏανε") || + endsWith(s, len, "ονÏανε") || + endsWith(s, len, "Î¿Ï Ïανε")) { + len -= 6; + removed = true; + } else if (len > 5 && endsWith(s, len, "αγανε") || + endsWith(s, len, "ηÏανε") || + endsWith(s, len, "οÏανε") || + endsWith(s, len, "ηκανε")) { + len -= 5; + removed = true; + } + + if (removed && exc8a.contains(s, 0, len)) { + // add -αγαν (we removed > 4 chars so its safe) + len += 4; + s[len - 4] = 'α'; + s[len - 3] = 'γ'; + s[len - 2] = 'α'; + s[len - 1] = 'ν'; + } + + if (len > 3 && endsWith(s, len, "ανε")) { + len -= 3; + if (endsWithVowelNoY(s, len) || exc8b.contains(s, 0, len)) { + len += 2; // add back -αν + } + } + + return len; + } + + private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αβαÏ", "βεν", "εναÏ", "αβÏ", "αδ", "αθ", "αν", "αÏλ", + "βαÏον", "νÏÏ", "Ïκ", "κοÏ", "μÏοÏ", "νιÏ", "Ïαγ", "ÏαÏακαλ", "ÏεÏÏ", + "Ïκελ", "ÏÏ ÏÏ", "Ïοκ", "Ï ", "δ", "εμ", "θαÏÏ", "θ"), + false); + + private int rule9(char s[], int len) { + if (len > 5 && endsWith(s, len, "ηÏεÏε")) + len -= 5; + + if (len > 3 && endsWith(s, len, "εÏε")) { + len -= 3; + if (exc9.contains(s, 0, len) || + endsWithVowelNoY(s, len) || + endsWith(s, len, "οδ") || + endsWith(s, len, "αιÏ") || + endsWith(s, len, "ÏοÏ") || + endsWith(s, len, "Ïαθ") || + endsWith(s, len, "διαθ") || + endsWith(s, len, "ÏÏ") || + endsWith(s, len, "ενδ") || + endsWith(s, len, "ÎµÏ Ï") || + endsWith(s, len, "Ïιθ") || + endsWith(s, len, "Ï ÏεÏθ") || + endsWith(s, len, "Ïαθ") || + endsWith(s, len, "ενθ") || + endsWith(s, len, "Ïοθ") || + endsWith(s, len, "Ïθ") || + endsWith(s, len, "ÏÏ Ï") || + endsWith(s, len, "αιν") || + endsWith(s, len, "ÏÏ Î½Î´") || + endsWith(s, len, "ÏÏ Î½") || + endsWith(s, len, "ÏÏ Î½Î¸") || + endsWith(s, len, "ÏÏÏ") || + endsWith(s, len, "Ïον") || + endsWith(s, len, "βÏ") || + endsWith(s, len, "καθ") || + endsWith(s, len, "ÎµÏ Î¸") || + endsWith(s, len, "εκθ") || + endsWith(s, len, "νεÏ") || + endsWith(s, len, "Ïον") || + endsWith(s, len, "αÏκ") || + endsWith(s, len, "βαÏ") || + endsWith(s, len, "βολ") || + endsWith(s, len, "ÏÏελ")) { + len += 2; // add back -ÎµÏ + } + } + + return len; + } + + private int rule10(char s[], int len) { + if (len > 5 && (endsWith(s, len, "ονÏαÏ") || endsWith(s, len, "ÏνÏαÏ"))) { + len -= 5; + if (len == 3 && endsWith(s, len, "αÏÏ")) { + len += 3; // add back *Î½Ï + s[len - 3] = 'ο'; + } + if (endsWith(s, len, "κÏε")) { + len += 3; // add back *Î½Ï + s[len - 3] = 'Ï'; + } + } + + return len; + } + + private int rule11(char s[], int len) { + if (len > 6 && endsWith(s, len, "ομαÏÏε")) { + len -= 6; + if (len == 2 && endsWith(s, len, "ον")) { + len += 5; // add back -ομαÏÏ + } + } else if (len > 7 && endsWith(s, len, "ιομαÏÏε")) { + len -= 7; + if (len == 2 && endsWith(s, len, "ον")) { + len += 5; + s[len - 5] = 'ο'; + s[len - 4] = 'μ'; + s[len - 3] = 'α'; + s[len - 2] = 'Ï'; + s[len - 1] = 'Ï'; + } + } + return len; + } + + private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31, + Arrays.asList("Ï", "αÏ", "ÏÏ Î¼Ï", "αÏÏ Î¼Ï", "ακαÏαÏ", "αμεÏαμÏ"), + false); + + private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αλ", "αÏ", "εκÏελ", "ζ", "μ", "ξ", "ÏαÏακαλ", "αÏ", "ÏÏο", "νιÏ"), + false); + + private int rule12(char s[], int len) { + if (len > 5 && endsWith(s, len, "ιεÏÏε")) { + len -= 5; + if (exc12a.contains(s, 0, len)) + len += 4; // add back -ιεÏÏ + } + + if (len > 4 && endsWith(s, len, "εÏÏε")) { + len -= 4; + if (exc12b.contains(s, 0, len)) + len += 3; // add back -εÏÏ + } + + return len; + } + + private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("διαθ", "θ", "ÏαÏακαÏαθ", "ÏÏοÏθ", "ÏÏ Î½Î¸"), + false); + + private int rule13(char s[], int len) { + if (len > 6 && endsWith(s, len, "ηθηκεÏ")) { + len -= 6; + } else if (len > 5 && (endsWith(s, len, "ηθηκα") || endsWith(s, len, "ηθηκε"))) { + len -= 5; + } + + boolean removed = false; + + if (len > 4 && endsWith(s, len, "ηκεÏ")) { + len -= 4; + removed = true; + } else if (len > 3 && (endsWith(s, len, "ηκα") || endsWith(s, len, "ηκε"))) { + len -= 3; + removed = true; + } + + if (removed && (exc13.contains(s, 0, len) + || endsWith(s, len, "ÏκÏλ") + || endsWith(s, len, "ÏÎºÎ¿Ï Î»") + || endsWith(s, len, "ναÏθ") + || endsWith(s, len, "ÏÏ") + || endsWith(s, len, "οθ") + || endsWith(s, len, "Ïιθ"))) { + len += 2; // add back the -ηκ + } + + return len; + } + + private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ÏαÏμακ", "Ïαδ", "αγκ", "αναÏÏ", "βÏομ", "εκλιÏ", "λαμÏιδ", + "λεÏ", "μ", "ÏαÏ", "Ï", "λ", "μεδ", "μεÏαζ", "Ï ÏοÏειν", "αμ", "αιθ", + "ανηκ", "δεÏÏοζ", "ενδιαÏεÏ", "δε", "Î´ÎµÏ ÏεÏÎµÏ ", "καθαÏÎµÏ ", "Ïλε", + "ÏÏα"), + false); + + private int rule14(char s[], int len) { + boolean removed = false; + + if (len > 5 && endsWith(s, len, "Î¿Ï ÏεÏ")) { + len -= 5; + removed = true; + } else if (len > 4 && (endsWith(s, len, "Î¿Ï Ïα") || endsWith(s, len, "Î¿Ï Ïε"))) { + len -= 4; + removed = true; + } + + if (removed && (exc14.contains(s, 0, len) + || endsWithVowel(s, len) + || endsWith(s, len, "ÏοδαÏ") + || endsWith(s, len, "βλεÏ") + || endsWith(s, len, "ÏανÏαÏ") + || endsWith(s, len, "ÏÏÏ Î´") + || endsWith(s, len, "μανÏιλ") + || endsWith(s, len, "μαλλ") + || endsWith(s, len, "ÎºÏ Î¼Î±Ï") + || endsWith(s, len, "λαÏ") + || endsWith(s, len, "ληγ") + || endsWith(s, len, "Ïαγ") + || endsWith(s, len, "ομ") + || endsWith(s, len, "ÏÏÏÏ"))) { + len += 3; // add back -Î¿Ï Ï + } + + return len; + } + + private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αβαÏÏ", "ÏÎ¿Î»Ï Ï", "αδηÏ", "ÏαμÏ", "Ï", "αÏÏ", "αÏ", "αμαλ", + "αμαλλι", "Î±Î½Ï ÏÏ", "αÏεÏ", "αÏÏαÏ", "αÏαÏ", "δεÏβεν", "δÏοÏοÏ", + "ξεÏ", "νεοÏ", "νομοÏ", "ολοÏ", "ομοÏ", "ÏÏοÏÏ", "ÏÏοÏÏÏοÏ", "ÏÏ Î¼Ï", + "ÏÏ Î½Ï", "Ï", "Ï ÏοÏ", "ÏαÏ", "αειÏ", "αιμοÏÏ", "Î±Î½Ï Ï", "αÏοÏ", + "αÏÏιÏ", "διαÏ", "εν", "εÏιÏ", "κÏοκαλοÏ", "ÏιδηÏοÏ", "λ", "Î½Î±Ï ", + "Î¿Ï Î»Î±Î¼", "Î¿Ï Ï", "Ï", "ÏÏ", "μ"), + false); + + private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ÏοÏ", "Î½Î±Ï Î»Î¿Ï"), + false); + + private int rule15(char s[], int len) { + boolean removed = false; + if (len > 4 && endsWith(s, len, "αγεÏ")) { + len -= 4; + removed = true; + } else if (len > 3 && (endsWith(s, len, "αγα") || endsWith(s, len, "αγε"))) { + len -= 3; + removed = true; + } + + if (removed) { + final boolean cond1 = exc15a.contains(s, 0, len) + || endsWith(s, len, "οÏ") + || endsWith(s, len, "Ïελ") + || endsWith(s, len, "ÏοÏÏ") + || endsWith(s, len, "λλ") + || endsWith(s, len, "ÏÏ") + || endsWith(s, len, "ÏÏ") + || endsWith(s, len, "ÏÏ") + || endsWith(s, len, "ÏÏ") + || endsWith(s, len, "λοÏ") + || endsWith(s, len, "Ïμην"); + + final boolean cond2 = exc15b.contains(s, 0, len) + || endsWith(s, len, "κολλ"); + + if (cond1 && !cond2) + len += 2; // add back -αγ + } + + return len; + } + + private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ν", "ÏεÏÏον", "δÏδεκαν", "εÏημον", "μεγαλον", "εÏÏαν"), + false); + + private int rule16(char s[], int len) { + boolean removed = false; + if (len > 4 && endsWith(s, len, "ηÏÎ¿Ï ")) { + len -= 4; + removed = true; + } else if (len > 3 && (endsWith(s, len, "ηÏε") || endsWith(s, len, "ηÏα"))) { + len -= 3; + removed = true; + } + + if (removed && exc16.contains(s, 0, len)) + len += 2; // add back -Î·Ï + + return len; + } + + private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("αÏβ", "Ïβ", "αÏÏ", "ÏÏ", "αÏλ", "αειμν", "Î´Ï ÏÏÏ", "ÎµÏ ÏÏ", "κοινοÏÏ", "ÏαλιμÏ"), + false); + + private int rule17(char s[], int len) { + if (len > 4 && endsWith(s, len, "ηÏÏε")) { + len -= 4; + if (exc17.contains(s, 0, len)) + len += 3; // add back the -ηÏÏ + } + + return len; + } + + private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ν", "Ï", "ÏÏι", "ÏÏÏÎ±Î²Î¿Î¼Î¿Ï ÏÏ", "ÎºÎ±ÎºÎ¿Î¼Î¿Ï ÏÏ", "εξÏν"), + false); + + private int rule18(char s[], int len) { + boolean removed = false; + + if (len > 6 && (endsWith(s, len, "ηÏÎ¿Ï Î½Îµ") || endsWith(s, len, "Î·Î¸Î¿Ï Î½Îµ"))) { + len -= 6; + removed = true; + } else if (len > 4 && endsWith(s, len, "Î¿Ï Î½Îµ")) { + len -= 4; + removed = true; + } + + if (removed && exc18.contains(s, 0, len)) { + len += 3; + s[len - 3] = 'ο'; + s[len - 2] = 'Ï '; + s[len - 1] = 'ν'; + } + return len; + } + + private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31, + Arrays.asList("ÏαÏαÏÎ¿Ï Ï", "Ï", "Ï", "ÏÏιοÏλ", "αζ", "αλλοÏÎ¿Ï Ï", "αÏÎ¿Ï Ï"), + false); + + private int rule19(char s[], int len) { + boolean removed = false; + + if (len > 6 && (endsWith(s, len, "ηÏÎ¿Ï Î¼Îµ") || endsWith(s, len, "Î·Î¸Î¿Ï Î¼Îµ"))) { + len -= 6; + removed = true; + } else if (len > 4 && endsWith(s, len, "Î¿Ï Î¼Îµ")) { + len -= 4; + removed = true; + } + + if (removed && exc19.contains(s, 0, len)) { + len += 3; + s[len - 3] = 'ο'; + s[len - 2] = 'Ï '; + s[len - 1] = 'μ'; + } + return len; + } + + private int rule20(char s[], int len) { + if (len > 5 && (endsWith(s, len, "μαÏÏν") || endsWith(s, len, "μαÏοÏ"))) + len -= 3; + else if (len > 4 && endsWith(s, len, "μαÏα")) + len -= 2; + return len; + } + + private int rule21(char s[], int len) { + if (len > 9 && endsWith(s, len, "ιονÏÎ¿Ï Ïαν")) + return len - 9; + + if (len > 8 && (endsWith(s, len, "ιομαÏÏαν") || + endsWith(s, len, "ιοÏαÏÏαν") || + endsWith(s, len, "Î¹Î¿Ï Î¼Î±ÏÏε") || + endsWith(s, len, "ονÏÎ¿Ï Ïαν"))) + return len - 8; + + if (len > 7 && (endsWith(s, len, "ιεμαÏÏε") || + endsWith(s, len, "ιεÏαÏÏε") || + endsWith(s, len, "Î¹Î¿Î¼Î¿Ï Î½Î±") || + endsWith(s, len, "ιοÏαÏÏε") || + endsWith(s, len, "ιοÏÎ¿Ï Î½Î±") || + endsWith(s, len, "Î¹Î¿Ï Î½Ïαι") || + endsWith(s, len, "Î¹Î¿Ï Î½Ïαν") || + endsWith(s, len, "ηθηκαÏε") || + endsWith(s, len, "ομαÏÏαν") || + endsWith(s, len, "οÏαÏÏαν") || + endsWith(s, len, "Î¿Ï Î¼Î±ÏÏε"))) + return len - 7; + + if (len > 6 && (endsWith(s, len, "Î¹Î¿Î¼Î¿Ï Î½") || + endsWith(s, len, "ιονÏαν") || + endsWith(s, len, "ιοÏÎ¿Ï Î½") || + endsWith(s, len, "ηθειÏε") || + endsWith(s, len, "ηθηκαν") || + endsWith(s, len, "Î¿Î¼Î¿Ï Î½Î±") || + endsWith(s, len, "οÏαÏÏε") || + endsWith(s, len, "οÏÎ¿Ï Î½Î±") || + endsWith(s, len, "Î¿Ï Î½Ïαι") || + endsWith(s, len, "Î¿Ï Î½Ïαν") || + endsWith(s, len, "Î¿Ï ÏαÏε"))) + return len - 6; + + if (len > 5 && (endsWith(s, len, "αγαÏε") || + endsWith(s, len, "ιεμαι") || + endsWith(s, len, "ιεÏαι") || + endsWith(s, len, "ιεÏαι") || + endsWith(s, len, "ιοÏαν") || + endsWith(s, len, "Î¹Î¿Ï Î¼Î±") || + endsWith(s, len, "ηθειÏ") || + endsWith(s, len, "Î·Î¸Î¿Ï Î½") || + endsWith(s, len, "ηκαÏε") || + endsWith(s, len, "ηÏαÏε") || + endsWith(s, len, "ηÏÎ¿Ï Î½") || + endsWith(s, len, "Î¿Î¼Î¿Ï Î½") || + endsWith(s, len, "ονÏαι") || + endsWith(s, len, "ονÏαν") || + endsWith(s, len, "οÏÎ¿Ï Î½") || + endsWith(s, len, "Î¿Ï Î¼Î±Î¹") || + endsWith(s, len, "Î¿Ï Ïαν"))) + return len - 5; + + if (len > 4 && (endsWith(s, len, "αγαν") || + endsWith(s, len, "αμαι") || + endsWith(s, len, "αÏαι") || + endsWith(s, len, "αÏαι") || + endsWith(s, len, "ειÏε") || + endsWith(s, len, "εÏαι") || + endsWith(s, len, "εÏαι") || + endsWith(s, len, "ηδεÏ") || + endsWith(s, len, "ηδÏν") || + endsWith(s, len, "ηθει") || + endsWith(s, len, "ηκαν") || + endsWith(s, len, "ηÏαν") || + endsWith(s, len, "ηÏει") || + endsWith(s, len, "ηÏεÏ") || + endsWith(s, len, "ομαι") || + endsWith(s, len, "οÏαν"))) + return len - 4; + + if (len > 3 && (endsWith(s, len, "αει") || + endsWith(s, len, "ειÏ") || + endsWith(s, len, "ηθÏ") || + endsWith(s, len, "ηÏÏ") || + endsWith(s, len, "Î¿Ï Î½") || + endsWith(s, len, "Î¿Ï Ï"))) + return len - 3; + + if (len > 2 && (endsWith(s, len, "αν") || + endsWith(s, len, "αÏ") || + endsWith(s, len, "αÏ") || + endsWith(s, len, "ει") || + endsWith(s, len, "εÏ") || + endsWith(s, len, "ηÏ") || + endsWith(s, len, "οι") || + endsWith(s, len, "οÏ") || + endsWith(s, len, "Î¿Ï ") || + endsWith(s, len, "Ï Ï") || + endsWith(s, len, "Ïν"))) + return len - 2; + + if (len > 1 && endsWithVowel(s, len)) + return len - 1; + + return len; + } + + private int rule22(char s[], int len) { + if (endsWith(s, len, "εÏÏεÏ") || + endsWith(s, len, "εÏÏαÏ")) + return len - 5; + + if (endsWith(s, len, "οÏεÏ") || + endsWith(s, len, "οÏαÏ") || + endsWith(s, len, "Ï ÏεÏ") || + endsWith(s, len, "Ï ÏαÏ") || + endsWith(s, len, "ÏÏεÏ") || + endsWith(s, len, "ÏÏαÏ")) + return len - 4; + + return len; + } + + private boolean endsWith(char s[], int len, String suffix) { + final int suffixLen = suffix.length(); + if (suffixLen > len) + return false; + for (int i = suffixLen - 1; i >= 0; i--) + if (s[len -(suffixLen - i)] != suffix.charAt(i)) + return false; + + return true; + } + + private boolean endsWithVowel(char s[], int len) { + if (len == 0) + return false; + switch(s[len - 1]) { + case 'α': + case 'ε': + case 'η': + case 'ι': + case 'ο': + case 'Ï ': + case 'Ï': + return true; + default: + return false; + } + } + + private boolean endsWithVowelNoY(char s[], int len) { + if (len == 0) + return false; + switch(s[len - 1]) { + case 'α': + case 'ε': + case 'η': + case 'ι': + case 'ο': + case 'Ï': + return true; + default: + return false; + } + } +}