lucene-java-3.5.0/lucene/src/test/org/apache/lucene/analysis/TestAnalyzers.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22 import java.io.Reader;
  23
  24 import org.apache.lucene.analysis.standard.StandardTokenizer;
  25 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  26 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  28 import org.apache.lucene.index.Payload;
  29
  30 public class TestAnalyzers extends BaseTokenStreamTestCase {
  31
  32   public void testSimple() throws Exception {
  33     Analyzer a = new SimpleAnalyzer(TEST_VERSION_CURRENT);
  34     assertAnalyzesTo(a, "foo bar FOO BAR",
  35                      new String[] { "foo", "bar", "foo", "bar" });
  36     assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
  37                      new String[] { "foo", "bar", "foo", "bar" });
  38     assertAnalyzesTo(a, "foo.bar.FOO.BAR",
  39                      new String[] { "foo", "bar", "foo", "bar" });
  40     assertAnalyzesTo(a, "U.S.A.",
  41                      new String[] { "u", "s", "a" });
  42     assertAnalyzesTo(a, "C++",
  43                      new String[] { "c" });
  44     assertAnalyzesTo(a, "B2B",
  45                      new String[] { "b", "b" });
  46     assertAnalyzesTo(a, "2B",
  47                      new String[] { "b" });
  48     assertAnalyzesTo(a, "\"QUOTED\" word",
  49                      new String[] { "quoted", "word" });
  50   }
  51
  52   public void testNull() throws Exception {
  53     Analyzer a = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
  54     assertAnalyzesTo(a, "foo bar FOO BAR",
  55                      new String[] { "foo", "bar", "FOO", "BAR" });
  56     assertAnalyzesTo(a, "foo      bar .  FOO <> BAR",
  57                      new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
  58     assertAnalyzesTo(a, "foo.bar.FOO.BAR",
  59                      new String[] { "foo.bar.FOO.BAR" });
  60     assertAnalyzesTo(a, "U.S.A.",
  61                      new String[] { "U.S.A." });
  62     assertAnalyzesTo(a, "C++",
  63                      new String[] { "C++" });
  64     assertAnalyzesTo(a, "B2B",
  65                      new String[] { "B2B" });
  66     assertAnalyzesTo(a, "2B",
  67                      new String[] { "2B" });
  68     assertAnalyzesTo(a, "\"QUOTED\" word",
  69                      new String[] { "\"QUOTED\"", "word" });
  70   }
  71
  72   public void testStop() throws Exception {
  73     Analyzer a = new StopAnalyzer(TEST_VERSION_CURRENT);
  74     assertAnalyzesTo(a, "foo bar FOO BAR",
  75                      new String[] { "foo", "bar", "foo", "bar" });
  76     assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
  77                      new String[] { "foo", "bar", "foo", "bar" });
  78   }
  79
  80   void verifyPayload(TokenStream ts) throws IOException {
  81     PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
  82     for(byte b=1;;b++) {
  83       boolean hasNext = ts.incrementToken();
  84       if (!hasNext) break;
  85       // System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
  86       // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
  87       assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
  88     }
  89   }
  90
  91   // Make sure old style next() calls result in a new copy of payloads
  92   public void testPayloadCopy() throws IOException {
  93     String s = "how now brown cow";
  94     TokenStream ts;
  95     ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
  96     ts = new PayloadSetter(ts);
  97     verifyPayload(ts);
  98
  99     ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
 100     ts = new PayloadSetter(ts);
 101     verifyPayload(ts);
 102   }
 103
 104   // LUCENE-1150: Just a compile time test, to ensure the
 105   // StandardAnalyzer constants remain publicly accessible
 106   @SuppressWarnings("unused")
 107   public void _testStandardConstants() {
 108     int x = StandardTokenizer.ALPHANUM;
 109     x = StandardTokenizer.APOSTROPHE;
 110     x = StandardTokenizer.ACRONYM;
 111     x = StandardTokenizer.COMPANY;
 112     x = StandardTokenizer.EMAIL;
 113     x = StandardTokenizer.HOST;
 114     x = StandardTokenizer.NUM;
 115     x = StandardTokenizer.CJ;
 116     String[] y = StandardTokenizer.TOKEN_TYPES;
 117   }
 118
 119   private static class LowerCaseWhitespaceAnalyzer extends Analyzer {
 120
 121     @Override
 122     public TokenStream tokenStream(String fieldName, Reader reader) {
 123       return new LowerCaseFilter(TEST_VERSION_CURRENT,
 124           new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
 125     }
 126
 127   }
 128
 129   /**
 130    * @deprecated remove this when lucene 3.0 "broken unicode 4" support
 131    * is no longer needed.
 132    */
 133   @Deprecated
 134   private static class LowerCaseWhitespaceAnalyzerBWComp extends Analyzer {
 135
 136     @Override
 137     public TokenStream tokenStream(String fieldName, Reader reader) {
 138       return new LowerCaseFilter(new WhitespaceTokenizer(reader));
 139     }
 140
 141   }
 142
 143   /**
 144    * Test that LowercaseFilter handles entire unicode range correctly
 145    */
 146   public void testLowerCaseFilter() throws IOException {
 147     Analyzer a = new LowerCaseWhitespaceAnalyzer();
 148     // BMP
 149     assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
 150     // supplementary
 151     assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
 152         new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
 153     assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
 154         new String[] { "abaca\ud801\udc3edaba" });
 155     // unpaired lead surrogate
 156     assertAnalyzesTo(a, "AbaC\uD801AdaBa",
 157         new String [] { "abac\uD801adaba" });
 158     // unpaired trail surrogate
 159     assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
 160         new String [] { "abac\uDC16adaba" });
 161   }
 162
 163   /**
 164    * Test that LowercaseFilter handles the lowercasing correctly if the term
 165    * buffer has a trailing surrogate character leftover and the current term in
 166    * the buffer ends with a corresponding leading surrogate.
 167    */
 168   public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
 169     // test if the limit of the termbuffer is correctly used with supplementary
 170     // chars
 171     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
 172         new StringReader("BogustermBogusterm\udc16"));
 173     LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT,
 174         tokenizer);
 175     assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
 176     filter.reset();
 177     String highSurEndingUpper = "BogustermBoguster\ud801";
 178     String highSurEndingLower = "bogustermboguster\ud801";
 179     tokenizer.reset(new StringReader(highSurEndingUpper));
 180     assertTokenStreamContents(filter, new String[] {highSurEndingLower});
 181     assertTrue(filter.hasAttribute(CharTermAttribute.class));
 182     char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
 183     int length = highSurEndingLower.length();
 184     assertEquals('\ud801', termBuffer[length - 1]);
 185     assertEquals('\udc3e', termBuffer[length]);
 186
 187   }
 188
 189   public void testLimitTokenCountAnalyzer() throws IOException {
 190     Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
 191     // dont use assertAnalyzesTo here, as the end offset is not the end of the string!
 192     assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1  2     3  4  5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4);
 193     assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
 194
 195     a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2);
 196     // dont use assertAnalyzesTo here, as the end offset is not the end of the string!
 197     assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
 198     assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
 199   }
 200
 201   /**
 202    * Test that LowercaseFilter only works on BMP for back compat,
 203    * depending upon version
 204    * @deprecated remove this test when lucene 3.0 "broken unicode 4" support
 205    * is no longer needed.
 206    */
 207   @Deprecated
 208   public void testLowerCaseFilterBWComp() throws IOException {
 209     Analyzer a = new LowerCaseWhitespaceAnalyzerBWComp();
 210     // BMP
 211     assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
 212     // supplementary, no-op
 213     assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
 214         new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
 215     assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
 216         new String[] { "abaca\ud801\udc16daba" });
 217     // unpaired lead surrogate
 218     assertAnalyzesTo(a, "AbaC\uD801AdaBa",
 219         new String [] { "abac\uD801adaba" });
 220     // unpaired trail surrogate
 221     assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
 222         new String [] { "abac\uDC16adaba" });
 223   }
 224
 225   /** blast some random strings through the analyzer */
 226   public void testRandomStrings() throws Exception {
 227     checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 228     checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 229     checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 230   }
 231 }
 232
 233 final class PayloadSetter extends TokenFilter {
 234   PayloadAttribute payloadAtt;
 235   public  PayloadSetter(TokenStream input) {
 236     super(input);
 237     payloadAtt = addAttribute(PayloadAttribute.class);
 238   }
 239
 240   byte[] data = new byte[1];
 241   Payload p = new Payload(data,0,1);
 242
 243   @Override
 244   public boolean incrementToken() throws IOException {
 245     boolean hasNext = input.incrementToken();
 246     if (!hasNext) return false;
 247     payloadAtt.setPayload(p);  // reuse the payload / byte[]
 248     data[0]++;
 249     return true;
 250   }
 251 }