--- /dev/null
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.util.Version;
+
+/**
+ * Testcase for {@link CharTokenizer} subclasses
+ */
+public class TestCharTokenizers extends BaseTokenStreamTestCase {
+
+ /*
+ * test to read surrogate pairs without losing the pairing
+ * if the surrogate pair is at the border of the internal IO buffer
+ */
+ public void testReadSupplementaryChars() throws IOException {
+   // Randomize how many "<surrogate pair>abc" units are generated; the count is
+   // at least 1024 before it is scaled by RANDOM_MULTIPLIER.
+   int unitCount = 1024 + random.nextInt(1024);
+   unitCount *= RANDOM_MULTIPLIER;
+
+   StringBuilder text = new StringBuilder();
+   int unit = 1;
+   while (unit < unitCount) {
+     text.append("\ud801\udc1cabc");
+     if (unit % 10 == 0) {
+       // a blank after every tenth unit separates the expected terms
+       text.append(" ");
+     }
+     unit++;
+   }
+   // The internal buffer size is 1024: insert a surrogate pair at char index
+   // 1023 so that its two halves sit right at that border.
+   text.insert(1023, "\ud801\udc1c");
+
+   String input = text.toString();
+   LowerCaseTokenizer tokenizer =
+       new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+   String[] expectedTerms = input.toLowerCase().split(" ");
+   assertTokenStreamContents(tokenizer, expectedTerms);
+ }
+
+ /*
+ * test extending the TermAttribute buffer internally. If the internal
+ * algorithm that grows the char array only extends it by 1 char and the
+ * next char to be filled in is a supplementary code point (using 2 chars),
+ * an index out of bounds exception is triggered.
+ */
+ public void testExtendCharBuffer() throws IOException {
+   // Grow an all-'a' prefix by one char per round (lengths 1 through 40) so the
+   // supplementary code point is appended at 40 different term offsets.
+   StringBuilder prefix = new StringBuilder();
+   for (int round = 0; round < 40; round++) {
+     prefix.append("a");
+     String input = prefix + "\ud801\udc1cabc";
+     LowerCaseTokenizer tokenizer =
+         new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+     // the whole input is expected back as a single lower-cased term
+     assertTokenStreamContents(tokenizer, new String[] { input.toLowerCase() });
+   }
+ }
+
+ /*
+ * tests the max word length of 255 - the tokenizer will split at the 255th char no matter what happens
+ */
+ public void testMaxWordLength() throws IOException {
+   // build one word of exactly 255 upper-case letters
+   StringBuilder word = new StringBuilder();
+   while (word.length() < 255) {
+     word.append("A");
+   }
+   String upperCaseWord = word.toString();
+   String lowerCaseWord = upperCaseWord.toLowerCase();
+
+   // Feed the word twice with no separator; two terms of 255 chars each are expected.
+   LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
+       TEST_VERSION_CURRENT, new StringReader(upperCaseWord + upperCaseWord));
+   assertTokenStreamContents(tokenizer, new String[] { lowerCaseWord, lowerCaseWord });
+ }
+
+ /*
+ * tests the max word length of 255 with a surrogate pair at position 255
+ */
+ public void testMaxWordLengthWithSupplementary() throws IOException {
+   // 254 single-char letters followed by one surrogate pair (two chars)
+   StringBuilder word = new StringBuilder();
+   for (int remaining = 254; remaining > 0; remaining--) {
+     word.append("A");
+   }
+   word.append("\ud801\udc1c");
+
+   String input = word.toString();
+   String expectedTerm = input.toLowerCase();
+
+   // Feed the word twice with no separator; each copy is expected back as one
+   // term that still ends with the complete surrogate pair.
+   LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
+       TEST_VERSION_CURRENT, new StringReader(input + input));
+   assertTokenStreamContents(tokenizer, new String[] { expectedTerm, expectedTerm });
+ }
+
+ public void testLowerCaseTokenizer() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
+ "\ud801\udc44test" });
+ }
+
+ public void testLowerCaseTokenizerBWCompat() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" });
+ }
+
+ public void testWhitespaceTokenizer() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
+ "\ud801\udc1ctest" });
+ }
+
+ public void testWhitespaceTokenizerBWCompat() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
+ "\ud801\udc1ctest" });
+ }
+
+ public void testIsTokenCharCharInSubclass() {
+ new TestingCharTokenizer(Version.LUCENE_30, new StringReader(""));
+ try {
+ new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader(""));
+ fail("version 3.1 is not permitted if char based method is implemented");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ }
+
+ public void testNormalizeCharInSubclass() {
+ new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader(""));
+ try {
+ new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT,
+ new StringReader(""));
+ fail("version 3.1 is not permitted if char based method is implemented");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ }
+
+ public void testNormalizeAndIsTokenCharCharInSubclass() {
+ new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30,
+ new StringReader(""));
+ try {
+ new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT,
+ new StringReader(""));
+ fail("version 3.1 is not permitted if char based method is implemented");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ }
+
+ /**
+  * {@link CharTokenizer} fixture that overrides both the deprecated char-based and the
+  * int-based isTokenChar; both variants accept letters only.
+  */
+ static final class TestingCharTokenizer extends CharTokenizer {
+
+   public TestingCharTokenizer(Version matchVersion, Reader input) {
+     super(matchVersion, input);
+   }
+
+   /** Deprecated char-based variant. */
+   @Deprecated
+   @Override
+   protected boolean isTokenChar(char c) {
+     return Character.isLetter(c);
+   }
+
+   /** Int-based variant taking an int code point value. */
+   @Override
+   protected boolean isTokenChar(int c) {
+     return Character.isLetter(c);
+   }
+ }
+
+ /**
+  * {@link CharTokenizer} fixture that overrides both the deprecated char-based and the
+  * int-based normalize; both variants return their argument unchanged.
+  */
+ static final class TestingCharTokenizerNormalize extends CharTokenizer {
+
+   public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
+     super(matchVersion, input);
+   }
+
+   /** Int-based variant: identity. */
+   @Override
+   protected int normalize(int c) {
+     return c;
+   }
+
+   /** Deprecated char-based variant: identity. */
+   @Deprecated
+   @Override
+   protected char normalize(char c) {
+     return c;
+   }
+ }
+
+ /**
+  * {@link CharTokenizer} fixture that overrides the deprecated char-based and the
+  * int-based variants of both normalize and isTokenChar. normalize returns its argument
+  * unchanged; isTokenChar accepts letters only.
+  */
+ static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {
+
+   public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion, Reader input) {
+     super(matchVersion, input);
+   }
+
+   // --- int-based overrides ---
+
+   @Override
+   protected boolean isTokenChar(int c) {
+     return Character.isLetter(c);
+   }
+
+   @Override
+   protected int normalize(int c) {
+     return c;
+   }
+
+   // --- deprecated char-based overrides ---
+
+   @Deprecated
+   @Override
+   protected boolean isTokenChar(char c) {
+     return Character.isLetter(c);
+   }
+
+   @Deprecated
+   @Override
+   protected char normalize(char c) {
+     return c;
+   }
+ }
+}