lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/hi/TestHindiStemmer.java

   1 package org.apache.lucene.analysis.hi;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22
  23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  24 import org.apache.lucene.analysis.MockTokenizer;
  25 import org.apache.lucene.analysis.TokenFilter;
  26 import org.apache.lucene.analysis.Tokenizer;
  27
  28 /**
  29  * Test HindiStemmer
  30  */
  31 public class TestHindiStemmer extends BaseTokenStreamTestCase {
  32   /**
  33    * Test masc noun inflections
  34    */
  35   public void testMasculineNouns() throws IOException {
  36     check("लडका", "लडक");
  37     check("लडके", "लडक");
  38     check("लडकों", "लडक");
  39
  40     check("गुरु", "गुर");
  41     check("गुरुओं", "गुर");
  42
  43     check("दोस्त", "दोस्त");
  44     check("दोस्तों", "दोस्त");
  45   }
  46
  47   /**
  48    * Test feminine noun inflections
  49    */
  50   public void testFeminineNouns() throws IOException {
  51     check("लडकी", "लडक");
  52     check("लडकियों", "लडक");
  53
  54     check("किताब", "किताब");
  55     check("किताबें", "किताब");
  56     check("किताबों", "किताब");
  57
  58     check("आध्यापीका", "आध्यापीक");
  59     check("आध्यापीकाएं", "आध्यापीक");
  60     check("आध्यापीकाओं", "आध्यापीक");
  61   }
  62
  63   /**
  64    * Test some verb forms
  65    */
  66   public void testVerbs() throws IOException {
  67     check("खाना", "खा");
  68     check("खाता", "खा");
  69     check("खाती", "खा");
  70     check("खा", "खा");
  71   }
  72
  73   /**
  74    * From the paper: since the suffix list for verbs includes AI, awA and anI,
  75    * additional suffixes had to be added to the list for noun/adjectives
  76    * ending with these endings.
  77    */
  78   public void testExceptions() throws IOException {
  79     check("कठिनाइयां", "कठिन");
  80     check("कठिन", "कठिन");
  81   }
  82
  83   private void check(String input, String output) throws IOException {
  84     Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
  85     TokenFilter tf = new HindiStemFilter(tokenizer);
  86     assertTokenStreamContents(tf, new String[] { output });
  87   }
  88 }