--- /dev/null
+package org.apache.lucene.analysis.lv;
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light stemmer for Latvian.
+ * <p>
+ * This is a light version of the algorithm in Karlis Kreslin's PhD thesis
+ * <i>A stemming algorithm for Latvian</i> with the following modifications:
+ * <ul>
+ * <li>Only explicitly stems noun and adjective morphology
+ * <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
+ * <li>Removes only the primary inflectional suffixes: case and number for nouns;
+ * case, number, gender, and definiteness for adjectives.
+ * <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
+ * </ul>
+ */
+public class LatvianStemmer {
+ /**
+ * Stem a latvian word. returns the new adjusted length.
+ */
+ public int stem(char s[], int len) {
+ int numVowels = numVowels(s, len);
+
+ for (int i = 0; i < affixes.length; i++) {
+ Affix affix = affixes[i];
+ if (numVowels > affix.vc && len >= affix.affix.length + 3 && endsWith(s, len, affix.affix)) {
+ len -= affix.affix.length;
+ return affix.palatalizes ? unpalatalize(s, len) : len;
+ }
+ }
+
+ return len;
+ }
+
+ static final Affix affixes[] = {
+ new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
+ new Affix("ajam", 2, false), new Affix("ajām", 2, false),
+ new Affix("ajos", 2, false), new Affix("ajās", 2, false),
+ new Affix("iem", 2, true), new Affix("ajā", 2, false),
+ new Affix("ais", 2, false), new Affix("ai", 2, false),
+ new Affix("ei", 2, false), new Affix("ām", 1, false),
+ new Affix("am", 1, false), new Affix("ēm", 1, false),
+ new Affix("īm", 1, false), new Affix("im", 1, false),
+ new Affix("um", 1, false), new Affix("us", 1, true),
+ new Affix("as", 1, false), new Affix("ās", 1, false),
+ new Affix("es", 1, false), new Affix("os", 1, true),
+ new Affix("ij", 1, false), new Affix("īs", 1, false),
+ new Affix("ēs", 1, false), new Affix("is", 1, false),
+ new Affix("ie", 1, false), new Affix("u", 1, true),
+ new Affix("a", 1, true), new Affix("i", 1, true),
+ new Affix("e", 1, false), new Affix("ā", 1, false),
+ new Affix("ē", 1, false), new Affix("ī", 1, false),
+ new Affix("ū", 1, false), new Affix("o", 1, false),
+ new Affix("s", 0, false), new Affix("š", 0, false),
+ };
+
+ static class Affix {
+ char affix[]; // suffix
+ int vc; // vowel count of the suffix
+ boolean palatalizes; // true if we should fire palatalization rules.
+
+ Affix(String affix, int vc, boolean palatalizes) {
+ this.affix = affix.toCharArray();
+ this.vc = vc;
+ this.palatalizes = palatalizes;
+ }
+ }
+
+ /**
+ * Most cases are handled except for the ambiguous ones:
+ * <ul>
+ * <li> s -> š
+ * <li> t -> š
+ * <li> d -> ž
+ * <li> z -> ž
+ * </ul>
+ */
+ private int unpalatalize(char s[], int len) {
+ // we check the character removed: if its -u then
+ // its 2,5, or 6 gen pl., and these two can only apply then.
+ if (s[len] == 'u') {
+ // kš -> kst
+ if (endsWith(s, len, "kš")) {
+ len++;
+ s[len-2] = 's';
+ s[len-1] = 't';
+ return len;
+ }
+ // ņņ -> nn
+ if (endsWith(s, len, "ņņ")) {
+ s[len-2] = 'n';
+ s[len-1] = 'n';
+ return len;
+ }
+ }
+
+ // otherwise all other rules
+ if (endsWith(s, len, "pj") || endsWith(s, len, "bj")
+ || endsWith(s, len, "mj") || endsWith(s, len, "vj")) {
+ // labial consonant
+ return len-1;
+ } else if (endsWith(s, len, "šņ")) {
+ s[len-2] = 's';
+ s[len-1] = 'n';
+ return len;
+ } else if (endsWith(s, len, "žņ")) {
+ s[len-2] = 'z';
+ s[len-1] = 'n';
+ return len;
+ } else if (endsWith(s, len, "šļ")) {
+ s[len-2] = 's';
+ s[len-1] = 'l';
+ return len;
+ } else if (endsWith(s, len, "žļ")) {
+ s[len-2] = 'z';
+ s[len-1] = 'l';
+ return len;
+ } else if (endsWith(s, len, "ļņ")) {
+ s[len-2] = 'l';
+ s[len-1] = 'n';
+ return len;
+ } else if (endsWith(s, len, "ļļ")) {
+ s[len-2] = 'l';
+ s[len-1] = 'l';
+ return len;
+ } else if (s[len-1] == 'č') {
+ s[len-1] = 'c';
+ return len;
+ } else if (s[len-1] == 'ļ') {
+ s[len-1] = 'l';
+ return len;
+ } else if (s[len-1] == 'ņ') {
+ s[len-1] = 'n';
+ return len;
+ }
+
+ return len;
+ }
+
+ /**
+ * Count the vowels in the string, we always require at least
+ * one in the remaining stem to accept it.
+ */
+ private int numVowels(char s[], int len) {
+ int n = 0;
+ for (int i = 0; i < len; i++) {
+ switch(s[i]) {
+ case 'a': case 'e': case 'i':
+ case 'o': case 'u': case 'ā':
+ case 'ī': case 'ē': case 'ū':
+ n++;
+ }
+ }
+ return n;
+ }
+}