lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java

   1 package org.apache.lucene.analysis.ar;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Collections;
  22 import java.util.HashSet;
  23 import java.util.Set;
  24
  25 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  26 import org.apache.lucene.analysis.CharArraySet;
  27
  28 /**
  29  * Test the Arabic Analyzer
  30  *
  31  */
  32 public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
  33
  34   /** This test fails with NPE when the
  35    * stopwords file is missing in classpath */
  36   public void testResourcesAvailable() {
  37     new ArabicAnalyzer(TEST_VERSION_CURRENT);
  38   }
  39
  40   /**
  41    * Some simple tests showing some features of the analyzer, how some regular forms will conflate
  42    */
  43   public void testBasicFeatures() throws Exception {
  44     ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);
  45     assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
  46     assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker
  47
  48     assertAnalyzesTo(a, "مشروب", new String[] { "مشروب" });
  49     assertAnalyzesTo(a, "مشروبات", new String[] { "مشروب" }); // plural -at
  50
  51     assertAnalyzesTo(a, "أمريكيين", new String[] { "امريك" }); // plural -in
  52     assertAnalyzesTo(a, "امريكي", new String[] { "امريك" }); // singular with bare alif
  53
  54     assertAnalyzesTo(a, "كتاب", new String[] { "كتاب" });
  55     assertAnalyzesTo(a, "الكتاب", new String[] { "كتاب" }); // definite article
  56
  57     assertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] { "ملكت", "ايمانكم"});
  58     assertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" }); // stopwords
  59   }
  60
  61   /**
  62    * Simple tests to show things are getting reset correctly, etc.
  63    */
  64   public void testReusableTokenStream() throws Exception {
  65     ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);
  66     assertAnalyzesToReuse(a, "كبير", new String[] { "كبير" });
  67     assertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker
  68   }
  69
  70   /**
  71    * Non-arabic text gets treated in a similar way as SimpleAnalyzer.
  72    */
  73   public void testEnglishInput() throws Exception {
  74     assertAnalyzesTo(new ArabicAnalyzer(TEST_VERSION_CURRENT), "English text.", new String[] {
  75         "english", "text" });
  76   }
  77
  78   /**
  79    * Test that custom stopwords work, and are not case-sensitive.
  80    */
  81   public void testCustomStopwords() throws Exception {
  82     Set<String> set = new HashSet<String>();
  83     Collections.addAll(set, "the", "and", "a");
  84     ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set);
  85     assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
  86         "brown", "fox" });
  87   }
  88
  89   public void testWithStemExclusionSet() throws IOException {
  90     Set<String> set = new HashSet<String>();
  91     set.add("ساهدهات");
  92     ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
  93     assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
  94     assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
  95
  96
  97     a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
  98     assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
  99     assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
 100   }
 101
 102   /** blast some random strings through the analyzer */
 103   public void testRandomStrings() throws Exception {
 104     checkRandomData(random, new ArabicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 105   }
 106 }