X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestCharTokenizers.java diff --git a/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestCharTokenizers.java b/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestCharTokenizers.java deleted file mode 100644 index ff6f961..0000000 --- a/lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestCharTokenizers.java +++ /dev/null @@ -1,222 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.util.Version; - -/** - * Testcase for {@link CharTokenizer} subclasses - */ -public class TestCharTokenizers extends BaseTokenStreamTestCase { - - /* - * test to read surrogate pairs without loosing the pairing - * if the surrogate pair is at the border of the internal IO buffer - */ - public void testReadSupplementaryChars() throws IOException { - StringBuilder builder = new StringBuilder(); - // create random input - int num = 1024 + random.nextInt(1024); - num *= RANDOM_MULTIPLIER; - for (int i = 1; i < num; i++) { - builder.append("\ud801\udc1cabc"); - if((i % 10) == 0) - builder.append(" "); - } - // internal buffer size is 1024 make sure we have a surrogate pair right at the border - builder.insert(1023, "\ud801\udc1c"); - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( - TEST_VERSION_CURRENT, new StringReader(builder.toString())); - assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); - } - - /* - * test to extend the buffer TermAttribute buffer internally. If the internal - * alg that extends the size of the char array only extends by 1 char and the - * next char to be filled in is a supplementary codepoint (using 2 chars) an - * index out of bound exception is triggered. - */ - public void testExtendCharBuffer() throws IOException { - for (int i = 0; i < 40; i++) { - StringBuilder builder = new StringBuilder(); - for (int j = 0; j < 1+i; j++) { - builder.append("a"); - } - builder.append("\ud801\udc1cabc"); - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( - TEST_VERSION_CURRENT, new StringReader(builder.toString())); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); - } - } - - /* - * tests the max word length of 255 - tokenizer will split at the 255 char no matter what happens - */ - public void testMaxWordLength() throws IOException { - StringBuilder builder = new StringBuilder(); - - for (int i = 0; i < 255; i++) { - builder.append("A"); - } - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( - TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); - } - - /* - * tests the max word length of 255 with a surrogate pair at position 255 - */ - public void testMaxWordLengthWithSupplementary() throws IOException { - StringBuilder builder = new StringBuilder(); - - for (int i = 0; i < 254; i++) { - builder.append("A"); - } - builder.append("\ud801\udc1c"); - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( - TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); - } - - public void testLowerCaseTokenizer() throws IOException { - StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, - reader); - assertTokenStreamContents(tokenizer, new String[] { "tokenizer", - "\ud801\udc44test" }); - } - - public void testLowerCaseTokenizerBWCompat() throws IOException { - StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); - LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30, - reader); - assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" }); - } - - public void testWhitespaceTokenizer() throws IOException { - StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); - WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, - reader); - assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", - "\ud801\udc1ctest" }); - } - - public void testWhitespaceTokenizerBWCompat() throws IOException { - StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); - WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30, - reader); - assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", - "\ud801\udc1ctest" }); - } - - public void testIsTokenCharCharInSubclass() { - new TestingCharTokenizer(Version.LUCENE_30, new StringReader("")); - try { - new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader("")); - fail("version 3.1 is not permitted if char based method is implemented"); - } catch (IllegalArgumentException e) { - // expected - } - } - - public void testNormalizeCharInSubclass() { - new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader("")); - try { - new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT, - new StringReader("")); - fail("version 3.1 is not permitted if char based method is implemented"); - } catch (IllegalArgumentException e) { - // expected - } - } - - public void testNormalizeAndIsTokenCharCharInSubclass() { - new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30, - new StringReader("")); - try { - new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT, - new StringReader("")); - fail("version 3.1 is not permitted if char based method is implemented"); - } catch (IllegalArgumentException e) { - // expected - } - } - - static final class TestingCharTokenizer extends CharTokenizer { - public TestingCharTokenizer(Version matchVersion, Reader input) { - super(matchVersion, input); - } - - @Override - protected boolean isTokenChar(int c) { - return Character.isLetter(c); - } - - @Deprecated @Override - protected boolean isTokenChar(char c) { - return Character.isLetter(c); - } - } - - static final class TestingCharTokenizerNormalize extends CharTokenizer { - public TestingCharTokenizerNormalize(Version matchVersion, Reader input) { - super(matchVersion, input); - } - - @Deprecated @Override - protected char normalize(char c) { - return c; - } - - @Override - protected int normalize(int c) { - return c; - } - } - - static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer { - public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion, - Reader input) { - super(matchVersion, input); - } - - @Deprecated @Override - protected char normalize(char c) { - return c; - } - - @Override - protected int normalize(int c) { - return c; - } - - @Override - protected boolean isTokenChar(int c) { - return Character.isLetter(c); - } - - @Deprecated @Override - protected boolean isTokenChar(char c) { - return Character.isLetter(c); - } - } -}