lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java

   1 package org.apache.lucene.analysis.ru;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.InputStreamReader;
  22
  23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  24 import org.apache.lucene.analysis.Analyzer;
  25 import org.apache.lucene.analysis.CharArraySet;
  26 import org.apache.lucene.analysis.TokenStream;
  27 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  28 import org.apache.lucene.util.Version;
  29
  30 /**
  31  * Test case for RussianAnalyzer.
  32  */
  33
  34 public class TestRussianAnalyzer extends BaseTokenStreamTestCase
  35 {
  36     private InputStreamReader inWords;
  37
  38     private InputStreamReader sampleUnicode;
  39
  40     /**
  41      * @deprecated remove this test and its datafiles in Lucene 4.0
  42      * the Snowball version has its own data tests.
  43      */
  44     @Deprecated
  45     public void testUnicode30() throws IOException
  46     {
  47         RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30);
  48         inWords =
  49             new InputStreamReader(
  50                 getClass().getResourceAsStream("testUTF8.txt"),
  51                 "UTF-8");
  52
  53         sampleUnicode =
  54             new InputStreamReader(
  55                 getClass().getResourceAsStream("resUTF8.htm"),
  56                 "UTF-8");
  57
  58         TokenStream in = ra.tokenStream("all", inWords);
  59
  60         RussianLetterTokenizer sample =
  61             new RussianLetterTokenizer(TEST_VERSION_CURRENT,
  62                 sampleUnicode);
  63
  64         CharTermAttribute text = in.getAttribute(CharTermAttribute.class);
  65         CharTermAttribute sampleText = sample.getAttribute(CharTermAttribute.class);
  66
  67         for (;;)
  68         {
  69           if (in.incrementToken() == false)
  70             break;
  71
  72             boolean nextSampleToken = sample.incrementToken();
  73             assertEquals(
  74                 "Unicode",
  75                 text.toString(),
  76                 nextSampleToken == false
  77                 ? null
  78                 : sampleText.toString());
  79         }
  80
  81         inWords.close();
  82         sampleUnicode.close();
  83     }
  84
  85     /** Check that RussianAnalyzer doesnt discard any numbers */
  86     public void testDigitsInRussianCharset() throws IOException
  87     {
  88       RussianAnalyzer ra = new RussianAnalyzer(TEST_VERSION_CURRENT);
  89       assertAnalyzesTo(ra, "text 1000", new String[] { "text", "1000" });
  90     }
  91
  92     /** @deprecated remove this test in Lucene 4.0: stopwords changed */
  93     @Deprecated
  94     public void testReusableTokenStream30() throws Exception {
  95       Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
  96       assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
  97           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
  98       assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
  99           new String[] { "знан", "хран", "тайн" });
 100     }
 101
 102     public void testReusableTokenStream() throws Exception {
 103       Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT);
 104       assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
 105           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
 106       assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
 107           new String[] { "знан", "эт", "хран", "тайн" });
 108     }
 109
 110
 111     public void testWithStemExclusionSet() throws Exception {
 112       CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
 113       set.add("представление");
 114       Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
 115       assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
 116           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
 117
 118     }
 119
 120     /** blast some random strings through the analyzer */
 121     public void testRandomStrings() throws Exception {
 122       checkRandomData(random, new RussianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 123     }
 124 }