lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java

   1 package org.apache.lucene.analysis.de;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22
  23 import org.apache.lucene.analysis.Analyzer;
  24 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  25 import org.apache.lucene.analysis.CharArraySet;
  26 import org.apache.lucene.analysis.KeywordMarkerFilter;
  27 import org.apache.lucene.analysis.LowerCaseTokenizer;
  28 import org.apache.lucene.util.Version;
  29
  30 public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
  31   public void testReusableTokenStream() throws Exception {
  32     Analyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
  33     checkOneTermReuse(a, "Tisch", "tisch");
  34     checkOneTermReuse(a, "Tische", "tisch");
  35     checkOneTermReuse(a, "Tischen", "tisch");
  36   }
  37
  38   public void testExclusionTableBWCompat() throws IOException {
  39     GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT,
  40         new StringReader("Fischen Trinken")));
  41     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  42     set.add("fischen");
  43     filter.setExclusionSet(set);
  44     assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  45   }
  46
  47   public void testWithKeywordAttribute() throws IOException {
  48     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  49     set.add("fischen");
  50     GermanStemFilter filter = new GermanStemFilter(
  51         new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
  52             "Fischen Trinken")), set));
  53     assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
  54   }
  55
  56   public void testWithKeywordAttributeAndExclusionTable() throws IOException {
  57     CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  58     set.add("fischen");
  59     CharArraySet set1 = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  60     set1.add("trinken");
  61     set1.add("fischen");
  62     GermanStemFilter filter = new GermanStemFilter(
  63         new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
  64             "Fischen Trinken")), set));
  65     filter.setExclusionSet(set1);
  66     assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
  67   }
  68
  69   /*
  70    * Test that changes to the exclusion table are applied immediately
  71    * when using reusable token streams.
  72    */
  73   public void testExclusionTableReuse() throws Exception {
  74     GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
  75     checkOneTermReuse(a, "tischen", "tisch");
  76     a.setStemExclusionTable(new String[] { "tischen" });
  77     checkOneTermReuse(a, "tischen", "tischen");
  78   }
  79
  80   /** test some features of the new snowball filter
  81    * these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer
  82    */
  83   public void testGermanSpecials() throws Exception {
  84     GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
  85     // a/o/u + e is equivalent to the umlaut form
  86     checkOneTermReuse(a, "Schaltflächen", "schaltflach");
  87     checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
  88     // here they are with the old stemmer
  89     a = new GermanAnalyzer(Version.LUCENE_30);
  90     checkOneTermReuse(a, "Schaltflächen", "schaltflach");
  91     checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
  92   }
  93
  94   /** blast some random strings through the analyzer */
  95   public void testRandomStrings() throws Exception {
  96     checkRandomData(random, new GermanAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
  97   }
  98 }