pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.4.0 / lucene / backwards / src / test / org / apache / lucene / analysis / TestAnalyzers.java
diff --git a/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestAnalyzers.java b/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestAnalyzers.java

deleted file mode 100644 (file)

index c97da67..0000000
--- a/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestAnalyzers.java
+++ /dev/null
@@ -1,251 +0,0 @@
-package org.apache.lucene.analysis;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.index.Payload;
-
-public class TestAnalyzers extends BaseTokenStreamTestCase {
-
-  public void testSimple() throws Exception {
-    Analyzer a = new SimpleAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesTo(a, "foo bar FOO BAR", 
-                     new String[] { "foo", "bar", "foo", "bar" });
-    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", 
-                     new String[] { "foo", "bar", "foo", "bar" });
-    assertAnalyzesTo(a, "foo.bar.FOO.BAR", 
-                     new String[] { "foo", "bar", "foo", "bar" });
-    assertAnalyzesTo(a, "U.S.A.", 
-                     new String[] { "u", "s", "a" });
-    assertAnalyzesTo(a, "C++", 
-                     new String[] { "c" });
-    assertAnalyzesTo(a, "B2B", 
-                     new String[] { "b", "b" });
-    assertAnalyzesTo(a, "2B", 
-                     new String[] { "b" });
-    assertAnalyzesTo(a, "\"QUOTED\" word", 
-                     new String[] { "quoted", "word" });
-  }
-
-  public void testNull() throws Exception {
-    Analyzer a = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesTo(a, "foo bar FOO BAR", 
-                     new String[] { "foo", "bar", "FOO", "BAR" });
-    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", 
-                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
-    assertAnalyzesTo(a, "foo.bar.FOO.BAR", 
-                     new String[] { "foo.bar.FOO.BAR" });
-    assertAnalyzesTo(a, "U.S.A.", 
-                     new String[] { "U.S.A." });
-    assertAnalyzesTo(a, "C++", 
-                     new String[] { "C++" });
-    assertAnalyzesTo(a, "B2B", 
-                     new String[] { "B2B" });
-    assertAnalyzesTo(a, "2B", 
-                     new String[] { "2B" });
-    assertAnalyzesTo(a, "\"QUOTED\" word", 
-                     new String[] { "\"QUOTED\"", "word" });
-  }
-
-  public void testStop() throws Exception {
-    Analyzer a = new StopAnalyzer(TEST_VERSION_CURRENT);
-    assertAnalyzesTo(a, "foo bar FOO BAR", 
-                     new String[] { "foo", "bar", "foo", "bar" });
-    assertAnalyzesTo(a, "foo a bar such FOO THESE BAR", 
-                     new String[] { "foo", "bar", "foo", "bar" });
-  }
-
-  void verifyPayload(TokenStream ts) throws IOException {
-    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
-    for(byte b=1;;b++) {
-      boolean hasNext = ts.incrementToken();
-      if (!hasNext) break;
-      // System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
-      // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
-      assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
-    }
-  }
-
-  // Make sure old style next() calls result in a new copy of payloads
-  public void testPayloadCopy() throws IOException {
-    String s = "how now brown cow";
-    TokenStream ts;
-    ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
-    ts = new PayloadSetter(ts);
-    verifyPayload(ts);
-
-    ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
-    ts = new PayloadSetter(ts);
-    verifyPayload(ts);
-  }
-
-  // LUCENE-1150: Just a compile time test, to ensure the
-  // StandardAnalyzer constants remain publicly accessible
-  @SuppressWarnings("unused")
-  public void _testStandardConstants() {
-    int x = StandardTokenizer.ALPHANUM;
-    x = StandardTokenizer.APOSTROPHE;
-    x = StandardTokenizer.ACRONYM;
-    x = StandardTokenizer.COMPANY;
-    x = StandardTokenizer.EMAIL;
-    x = StandardTokenizer.HOST;
-    x = StandardTokenizer.NUM;
-    x = StandardTokenizer.CJ;
-    String[] y = StandardTokenizer.TOKEN_TYPES;
-  }
-
-  private static class LowerCaseWhitespaceAnalyzer extends Analyzer {
-
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new LowerCaseFilter(TEST_VERSION_CURRENT,
-          new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
-    }
-    
-  }
-  
-  /**
-   * @deprecated remove this when lucene 3.0 "broken unicode 4" support
-   * is no longer needed.
-   */
-  @Deprecated
-  private static class LowerCaseWhitespaceAnalyzerBWComp extends Analyzer {
-
-    @Override
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new LowerCaseFilter(new WhitespaceTokenizer(reader));
-    }
-    
-  }
-  
-  /**
-   * Test that LowercaseFilter handles entire unicode range correctly
-   */
-  public void testLowerCaseFilter() throws IOException {
-    Analyzer a = new LowerCaseWhitespaceAnalyzer();
-    // BMP
-    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
-    // supplementary
-    assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
-        new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
-    assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA", 
-        new String[] { "abaca\ud801\udc3edaba" });
-    // unpaired lead surrogate
-    assertAnalyzesTo(a, "AbaC\uD801AdaBa", 
-        new String [] { "abac\uD801adaba" });
-    // unpaired trail surrogate
-    assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
-        new String [] { "abac\uDC16adaba" });
-  }
-  
-  /**
-   * Test that LowercaseFilter handles the lowercasing correctly if the term
-   * buffer has a trailing surrogate character leftover and the current term in
-   * the buffer ends with a corresponding leading surrogate.
-   */
-  public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
-    // test if the limit of the termbuffer is correctly used with supplementary
-    // chars
-    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, 
-        new StringReader("BogustermBogusterm\udc16"));
-    LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT,
-        tokenizer);
-    assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
-    filter.reset();
-    String highSurEndingUpper = "BogustermBoguster\ud801";
-    String highSurEndingLower = "bogustermboguster\ud801";
-    tokenizer.reset(new StringReader(highSurEndingUpper));
-    assertTokenStreamContents(filter, new String[] {highSurEndingLower});
-    assertTrue(filter.hasAttribute(CharTermAttribute.class));
-    char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
-    int length = highSurEndingLower.length();
-    assertEquals('\ud801', termBuffer[length - 1]);
-    assertEquals('\udc3e', termBuffer[length]);
-    
-  }
-  
-  public void testLimitTokenCountAnalyzer() throws IOException {
-    Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
-    // dont use assertAnalyzesTo here, as the end offset is not the end of the string!
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1  2     3  4  5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4);
-    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
-    
-    a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2);
-    // dont use assertAnalyzesTo here, as the end offset is not the end of the string!
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
-    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
-  }
-  
-  /**
-   * Test that LowercaseFilter only works on BMP for back compat,
-   * depending upon version
-   * @deprecated remove this test when lucene 3.0 "broken unicode 4" support
-   * is no longer needed.
-   */
-  @Deprecated
-  public void testLowerCaseFilterBWComp() throws IOException {
-    Analyzer a = new LowerCaseWhitespaceAnalyzerBWComp();
-    // BMP
-    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
-    // supplementary, no-op
-    assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
-        new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
-    assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
-        new String[] { "abaca\ud801\udc16daba" });
-    // unpaired lead surrogate
-    assertAnalyzesTo(a, "AbaC\uD801AdaBa", 
-        new String [] { "abac\uD801adaba" });
-    // unpaired trail surrogate
-    assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
-        new String [] { "abac\uDC16adaba" });
-  }
-
-  /** blast some random strings through the analyzer */
-  public void testRandomStrings() throws Exception {
-    checkRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
-  } 
-}
-
-final class PayloadSetter extends TokenFilter {
-  PayloadAttribute payloadAtt;
-  public  PayloadSetter(TokenStream input) {
-    super(input);
-    payloadAtt = addAttribute(PayloadAttribute.class);
-  }
-
-  byte[] data = new byte[1];
-  Payload p = new Payload(data,0,1);
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    boolean hasNext = input.incrementToken();
-    if (!hasNext) return false;
-    payloadAtt.setPayload(p);  // reuse the payload / byte[]
-    data[0]++;
-    return true;
-  }
-}
-\ No newline at end of file