1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.io.StringReader;
24 import org.apache.lucene.util.Version;
27 * Testcase for {@link CharTokenizer} subclasses
29 public class TestCharTokenizers extends BaseTokenStreamTestCase {
32 * test to read surrogate pairs without losing the pairing
33 * if the surrogate pair is at the border of the internal IO buffer
35 public void testReadSupplementaryChars() throws IOException {
// Builds a long input from repeated chunks of the supplementary code point
// U+1041C followed by "abc", forces one more surrogate pair in at char index
// 1023, and compares the tokenizer output with the lower-cased input split on
// single spaces.
36 StringBuilder builder = new StringBuilder();
37 // create random input
38 int num = 1024 + random.nextInt(1024);
39 num *= RANDOM_MULTIPLIER;
// the counter starts at 1, so the chunk below is appended num - 1 times
40 for (int i = 1; i < num; i++) {
41 builder.append("\ud801\udc1cabc");
// NOTE(review): the embedded numbering jumps from 41 to 45, so the rest of the
// loop body (and its closing brace) is not visible in this view.
45 // internal buffer size is 1024 make sure we have a surrogate pair right at the border
46 builder.insert(1023, "\ud801\udc1c");
47 LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
48 TEST_VERSION_CURRENT, new StringReader(builder.toString()));
// expected tokens: the whole input lower-cased, then split on " "
49 assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
53 * test to extend the TermAttribute buffer internally. If the internal
54 * algorithm that extends the size of the char array only extends by 1 char and
55 * the next char to be filled in is a supplementary codepoint (using 2 chars), an
56 * index out of bounds exception is triggered.
58 public void testExtendCharBuffer() throws IOException {
// Runs 40 rounds; each round starts from a fresh builder, ends the input with
// the supplementary code point U+1041C followed by "abc", and expects the
// tokenizer to return the whole lower-cased input as one single token.
59 for (int i = 0; i < 40; i++) {
60 StringBuilder builder = new StringBuilder();
// inner loop runs i + 1 times, so its iteration count grows with every round
61 for (int j = 0; j < 1+i; j++) {
// NOTE(review): the embedded numbering jumps from 61 to 64, so the inner loop
// body is not visible in this view.
64 builder.append("\ud801\udc1cabc");
65 LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
66 TEST_VERSION_CURRENT, new StringReader(builder.toString()));
// exactly one expected token: the complete builder content, lower-cased
67 assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
72 * tests the max word length of 255 - tokenizer will split at the 255th char no matter what happens
74 public void testMaxWordLength() throws IOException {
// Feeds the tokenizer the builder content twice in a row and expects two
// tokens, each equal to the lower-cased builder content.
75 StringBuilder builder = new StringBuilder();
// 255 iterations; NOTE(review): the loop body (embedded lines 78-79) is not
// visible in this view, so what gets appended cannot be confirmed from here.
77 for (int i = 0; i < 255; i++) {
// the input is the builder content concatenated with itself, without a separator
80 LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
81 TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
82 assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
86 * tests the max word length of 255 with a surrogate pair at position 255
88 public void testMaxWordLengthWithSupplementary() throws IOException {
// Same shape as the plain max-word-length check, except the builder content
// ends with one surrogate pair (code point U+1041C) after the loop.
89 StringBuilder builder = new StringBuilder();
// 254 iterations; NOTE(review): the loop body (embedded lines 92-93) is not
// visible in this view.
91 for (int i = 0; i < 254; i++) {
// one supplementary code point, i.e. two chars, appended after the loop
94 builder.append("\ud801\udc1c");
// the input is the builder content concatenated with itself, without a separator
95 LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
96 TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
// two expected tokens, each the lower-cased builder content
97 assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
100 public void testLowerCaseTokenizer() throws IOException {
// Tokenizes "Tokenizer <U+1041C>test" with TEST_VERSION_CURRENT and expects
// two tokens: "tokenizer" and "<U+10444>test", i.e. the supplementary code
// point is kept in the token and comes out as U+10444.
101 StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
// NOTE(review): the continuation line carrying the remaining constructor
// argument (embedded line 103) is not visible in this view.
102 LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT,
104 assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
105 "\ud801\udc44test" });
108 public void testLowerCaseTokenizerBWCompat() throws IOException {
// Same input as the current-version test, but built with Version.LUCENE_30.
// The expected second token is plain "test": the supplementary code point
// U+1041C from the input does not appear in the expected output.
109 StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
// NOTE(review): the continuation line carrying the remaining constructor
// argument (embedded line 111) is not visible in this view.
110 LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30,
112 assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" });
115 public void testWhitespaceTokenizer() throws IOException {
// Tokenizes "Tokenizer <U+1041C>test" with TEST_VERSION_CURRENT and expects
// both tokens back exactly as written in the input: case is unchanged and the
// surrogate pair stays at the start of the second token.
116 StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
// NOTE(review): the continuation line carrying the remaining constructor
// argument (embedded line 118) is not visible in this view.
117 WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
119 assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
120 "\ud801\udc1ctest" });
123 public void testWhitespaceTokenizerBWCompat() throws IOException {
// Same input and same expected tokens as the current-version whitespace test,
// but the tokenizer is built with Version.LUCENE_30.
124 StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
// NOTE(review): the continuation line carrying the remaining constructor
// argument (embedded line 126) is not visible in this view.
125 WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30,
// the expected second token still starts with the surrogate pair
127 assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
128 "\ud801\udc1ctest" });
131 public void testIsTokenCharCharInSubclass() {
// Constructs TestingCharTokenizer twice: once with Version.LUCENE_30 and once
// with TEST_VERSION_CURRENT. Only the second construction is followed by
// fail(...), so it is the one expected to throw.
132 new TestingCharTokenizer(Version.LUCENE_30, new StringReader(""));
// NOTE(review): the opening "try" (embedded line 133) is not visible in this
// view; the catch clause below implies it sits here.
134 new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader(""));
// reached only when the construction above did not throw
135 fail("version 3.1 is not permitted if char based method is implemented");
// IllegalArgumentException is the outcome this test accepts; the handler body
// is not visible in this view
136 } catch (IllegalArgumentException e) {
141 public void testNormalizeCharInSubclass() {
// Constructs TestingCharTokenizerNormalize twice: once with Version.LUCENE_30
// and once with TEST_VERSION_CURRENT. Only the second construction is followed
// by fail(...), so it is the one expected to throw.
142 new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader(""));
// NOTE(review): the opening "try" (embedded line 143) is not visible in this
// view; the catch clause below implies it sits here.
144 new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT,
145 new StringReader(""));
// reached only when the construction above did not throw
146 fail("version 3.1 is not permitted if char based method is implemented");
// IllegalArgumentException is the outcome this test accepts; the handler body
// is not visible in this view
147 } catch (IllegalArgumentException e) {
152 public void testNormalizeAndIsTokenCharCharInSubclass() {
// Constructs TestingCharTokenizerNormalizeIsTokenChar twice: once with
// Version.LUCENE_30 and once with TEST_VERSION_CURRENT. Only the second
// construction is followed by fail(...), so it is the one expected to throw.
153 new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30,
154 new StringReader(""));
// NOTE(review): the opening "try" (embedded line 155) is not visible in this
// view; the catch clause below implies it sits here.
156 new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT,
157 new StringReader(""));
// reached only when the construction above did not throw
158 fail("version 3.1 is not permitted if char based method is implemented");
// IllegalArgumentException is the outcome this test accepts; the handler body
// is not visible in this view
159 } catch (IllegalArgumentException e) {
164 static final class TestingCharTokenizer extends CharTokenizer {
// Test fixture: a CharTokenizer subclass that defines isTokenChar for both an
// int argument and a char argument; the char variant is marked @Deprecated.
// Both variants accept exactly what Character.isLetter accepts.
165 public TestingCharTokenizer(Version matchVersion, Reader input) {
// passes both arguments straight to the CharTokenizer constructor
166 super(matchVersion, input);
// int-based variant
170 protected boolean isTokenChar(int c) {
171 return Character.isLetter(c);
// char-based variant, explicitly an override of the superclass method
174 @Deprecated @Override
175 protected boolean isTokenChar(char c) {
176 return Character.isLetter(c);
180 static final class TestingCharTokenizerNormalize extends CharTokenizer {
// Test fixture: a CharTokenizer subclass that defines normalize for both a
// char argument (marked @Deprecated) and an int argument.
// NOTE(review): neither method body is visible in this view (the embedded
// numbering skips 187-190 and everything after 191).
181 public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
// passes both arguments straight to the CharTokenizer constructor
182 super(matchVersion, input);
// char-based variant, explicitly an override of the superclass method
185 @Deprecated @Override
186 protected char normalize(char c) {
// int-based variant
191 protected int normalize(int c) {
196 static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {
// Test fixture: a CharTokenizer subclass that defines both normalize and
// isTokenChar, each in a char-based (@Deprecated) and an int-based variant.
// NOTE(review): the second constructor parameter line (embedded line 198) and
// the normalize bodies are not visible in this view.
197 public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion,
// passes both arguments straight to the CharTokenizer constructor
199 super(matchVersion, input);
// char-based normalize, explicitly an override of the superclass method
202 @Deprecated @Override
203 protected char normalize(char c) {
// int-based normalize
208 protected int normalize(int c) {
// int-based isTokenChar: accepts what Character.isLetter accepts
213 protected boolean isTokenChar(int c) {
214 return Character.isLetter(c);
// char-based isTokenChar, explicitly an override of the superclass method
217 @Deprecated @Override
218 protected boolean isTokenChar(char c) {
219 return Character.isLetter(c);