1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.List;
26 import java.util.Iterator;
28 import org.apache.lucene.util.LuceneTestCase;
29 import org.apache.lucene.util.Version;
32 public class TestCharArraySet extends LuceneTestCase {
// Fixture shared by the tests in this class: 33 lower-case English words that are
// added to the sets under test.
34 static final String[] TEST_STOP_WORDS = {
35 "a", "an", "and", "are", "as", "at", "be", "but", "by",
36 "for", "if", "in", "into", "is", "it",
37 "no", "not", "of", "on", "or", "such",
38 "that", "the", "their", "then", "there", "these",
39 "they", "this", "to", "was", "will", "with"
43 public void testRehash() throws Exception {
44 CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 0, true);
45 for(int i=0;i<TEST_STOP_WORDS.length;i++)
46 cas.add(TEST_STOP_WORDS[i]);
47 assertEquals(TEST_STOP_WORDS.length, cas.size());
48 for(int i=0;i<TEST_STOP_WORDS.length;i++)
49 assertTrue(cas.contains(TEST_STOP_WORDS[i]));
// Looks up "this" two ways: through contains(char[], int, int) using offset 1 and
// length 4 inside the larger buffer "xthisy", and through a String built from that
// same window. Both lookups are then repeated on the unmodifiable view of the set.
52 public void testNonZeroOffset() {
53 String[] words={"Hello","World","this","is","a","test"};
54 char[] findme="xthisy".toCharArray();
55 CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true);
56 set.addAll(Arrays.asList(words));
57 assertTrue(set.contains(findme, 1, 4));
58 assertTrue(set.contains(new String(findme,1,4)));
// Re-run the same two checks against the wrapper returned by unmodifiableSet().
61 set = CharArraySet.unmodifiableSet(set);
62 assertTrue(set.contains(findme, 1, 4));
63 assertTrue(set.contains(new String(findme,1,4)));
// Expects an Integer entry to be reported as contained when probed with the same
// Integer, a distinct but equal Integer, the String "1" and the char[] {'1'} --
// first on the plain set, then on its unmodifiable view.
66 public void testObjectContains() {
67 CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
68 Integer val = Integer.valueOf(1);
// NOTE(review): embedded line 69 is absent from this view; presumably it adds val to
// the set (the assertions below need it to be present) -- confirm against the full file.
70 assertTrue(set.contains(val));
71 assertTrue(set.contains(new Integer(1))); // another integer
72 assertTrue(set.contains("1"));
73 assertTrue(set.contains(new char[]{'1'}));
// Same four probes against the unmodifiable view.
75 set = CharArraySet.unmodifiableSet(set);
76 assertTrue(set.contains(val));
77 assertTrue(set.contains(new Integer(1))); // another integer
78 assertTrue(set.contains("1"));
79 assertTrue(set.contains(new char[]{'1'}));
// Fills a set with the stop words, expects it to be empty afterwards (size 0 and no
// stop word contained), then refills it and checks that every word is present again.
82 public void testClear(){
83 CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true);
84 set.addAll(Arrays.asList(TEST_STOP_WORDS));
85 assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
// NOTE(review): embedded line 86 is absent from this view; presumably the call that
// empties the set (the next assertion expects size 0) -- confirm against the full file.
87 assertEquals("not empty", 0, set.size());
88 for(int i=0;i<TEST_STOP_WORDS.length;i++)
89 assertFalse(set.contains(TEST_STOP_WORDS[i]));
// The set must be usable again after having been emptied.
90 set.addAll(Arrays.asList(TEST_STOP_WORDS));
91 assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
92 for(int i=0;i<TEST_STOP_WORDS.length;i++)
93 assertTrue(set.contains(TEST_STOP_WORDS[i]));
// Wraps a populated set with unmodifiableSet() and checks that mutating calls are
// answered with UnsupportedOperationException (every fail(...) is followed by a catch
// for that exception) while the size and contents of the set stay as they were.
// NOTE(review): short lines are absent from this view -- the try openers, the closing
// braces and the mutating calls that belong to embedded lines 115 and 133. Confirm
// against the full file before relying on the per-section comments below.
96 public void testModifyOnUnmodifiable(){
97 CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10, true);
98 set.addAll(Arrays.asList(TEST_STOP_WORDS));
99 final int size = set.size();
100 set = CharArraySet.unmodifiableSet(set);
101 assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
102 String NOT_IN_SET = "SirGallahad";
103 assertFalse("Test String already exists in set", set.contains(NOT_IN_SET));
// add(char[])
106 set.add(NOT_IN_SET.toCharArray());
107 fail("Modified unmodifiable set");
108 }catch (UnsupportedOperationException e) {
110 assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
111 assertEquals("Size of unmodifiable set has changed", size, set.size());
// NOTE(review): the mutating call for this section (embedded line 115) is not visible.
116 fail("Modified unmodifiable set");
117 }catch (UnsupportedOperationException e) {
119 assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
120 assertEquals("Size of unmodifiable set has changed", size, set.size());
// add(StringBuilder)
124 set.add(new StringBuilder(NOT_IN_SET));
125 fail("Modified unmodifiable set");
126 }catch (UnsupportedOperationException e) {
128 assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
129 assertEquals("Size of unmodifiable set has changed", size, set.size());
// NOTE(review): the mutating call for this section (embedded line 133) is not visible.
134 fail("Modified unmodifiable set");
135 }catch (UnsupportedOperationException e) {
137 assertFalse("Changed unmodifiable set", set.contains(NOT_IN_SET));
138 assertEquals("Size of unmodifiable set has changed", size, set.size());
// add(Object)
141 set.add((Object) NOT_IN_SET);
142 fail("Modified unmodifiable set");
143 }catch (UnsupportedOperationException e) {
145 assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
146 assertEquals("Size of unmodifiable set has changed", size, set.size());
149 // This test was changed in 3.1, as a contains() call on the given Collection using the "correct" iterator's
150 // current key (now a char[]) on a Set<String> would not hit any element of the CAS and therefore never call
151 // remove() on the iterator
// removeAll(...) with a CharArraySet holding the stop words
153 set.removeAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), true));
154 fail("Modified unmodifiable set");
155 }catch (UnsupportedOperationException e) {
157 assertEquals("Size of unmodifiable set has changed", size, set.size());
// retainAll(...) with a CharArraySet holding only NOT_IN_SET
161 set.retainAll(new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(NOT_IN_SET), true));
162 fail("Modified unmodifiable set");
163 }catch (UnsupportedOperationException e) {
165 assertEquals("Size of unmodifiable set has changed", size, set.size());
// addAll(...) with a one-element list
169 set.addAll(Arrays.asList(new String[]{NOT_IN_SET}));
170 fail("Modified unmodifiable set");
171 }catch (UnsupportedOperationException e) {
173 assertFalse("Test String has been added to unmodifiable set", set.contains(NOT_IN_SET));
// Finally, every original stop word must still be found.
176 for (int i = 0; i < TEST_STOP_WORDS.length; i++) {
177 assertTrue(set.contains(TEST_STOP_WORDS[i]));
// Checks that unmodifiableSet() keeps size and contents of the wrapped set (stop words
// plus the Integer 1, probed as Integer, String and char[]) and that passing null is
// expected to end in a NullPointerException.
181 public void testUnmodifiableSet(){
182 CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 10,true);
183 set.addAll(Arrays.asList(TEST_STOP_WORDS));
184 set.add(Integer.valueOf(1));
185 final int size = set.size();
186 set = CharArraySet.unmodifiableSet(set);
187 assertEquals("Set size changed due to unmodifiableSet call" , size, set.size());
188 for (String stopword : TEST_STOP_WORDS) {
189 assertTrue(set.contains(stopword));
191 assertTrue(set.contains(Integer.valueOf(1)));
192 assertTrue(set.contains("1"));
193 assertTrue(set.contains(new char[]{'1'}));
// NOTE(review): embedded lines 194-195 are absent from this view; presumably they hold
// the try opener that pairs with the catch below -- confirm against the full file.
196 CharArraySet.unmodifiableSet(null);
197 fail("can not make null unmodifiable");
198 }catch (NullPointerException e) {
// Case handling for terms containing supplementary characters (written as surrogate
// pairs: U+1041C in the upper-case terms, U+10444 in the lower-case terms).
// With the ignore-case flag set to true both spellings are expected to be found; with
// false only the upper-case spellings are.
// NOTE(review): the bodies of the two for-each loops over upperArr (embedded lines 214
// and 222) are absent from this view; presumably they add each term to the set --
// confirm against the full file.
203 public void testSupplementaryChars() {
204 String missing = "Term %s is missing in the set";
205 String falsePos = "Term %s is in the set but shouldn't";
207 // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
208 String[] upperArr = new String[] {"Abc\ud801\udc1c",
209 "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
210 String[] lowerArr = new String[] {"abc\ud801\udc44",
211 "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
// Pass 1: third constructor argument true.
212 CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), true);
213 for (String upper : upperArr) {
216 for (int i = 0; i < upperArr.length; i++) {
217 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
218 assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
// Pass 2: third constructor argument false.
220 set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS), false);
221 for (String upper : upperArr) {
224 for (int i = 0; i < upperArr.length; i++) {
225 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
226 assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
230 public void testSingleHighSurrogate() {
231 String missing = "Term %s is missing in the set";
232 String falsePos = "Term %s is in the set but shouldn't";
233 String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
234 "\uD800EfG", "\uD800\ud801\udc1cB" };
236 String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
237 "\uD800efg", "\uD800\ud801\udc44b" };
238 CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays
239 .asList(TEST_STOP_WORDS), true);
240 for (String upper : upperArr) {
243 for (int i = 0; i < upperArr.length; i++) {
244 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
245 assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
247 set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList(TEST_STOP_WORDS),
249 for (String upper : upperArr) {
252 for (int i = 0; i < upperArr.length; i++) {
253 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
254 assertFalse(String.format(falsePos, upperArr[i]), set
255 .contains(lowerArr[i]));
260 * @deprecated remove this test when lucene 3.0 "broken unicode 4" support is
// Same terms as the supplementary-character test, but the sets are created with
// Version.LUCENE_30. Here the lower-case spellings are expected NOT to be found in
// either pass, i.e. also when the third constructor argument is true.
// NOTE(review): the bodies of the two for-each loops over upperArr (embedded lines 275
// and 283) are absent from this view; presumably they add each term to the set --
// confirm against the full file.
264 public void testSupplementaryCharsBWCompat() {
265 String missing = "Term %s is missing in the set";
266 String falsePos = "Term %s is in the set but shouldn't";
268 // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
269 String[] upperArr = new String[] {"Abc\ud801\udc1c",
270 "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
271 String[] lowerArr = new String[] {"abc\ud801\udc44",
272 "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
// Pass 1: third constructor argument true.
273 CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), true);
274 for (String upper : upperArr) {
277 for (int i = 0; i < upperArr.length; i++) {
278 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
279 assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
// Pass 2: third constructor argument false.
281 set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), false);
282 for (String upper : upperArr) {
285 for (int i = 0; i < upperArr.length; i++) {
286 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
287 assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
292 * @deprecated remove this test when lucene 3.0 "broken unicode 4" support is
// Version.LUCENE_30 variant of the single-high-surrogate test. In the first pass the
// lower-case spelling is expected to be found for every term except the last one
// (the only term that also contains a complete surrogate pair), which is expected
// to be absent. In the second pass no lower-case spelling is expected to be found.
// NOTE(review): the method name spells "Comapt"; it is left unchanged here.
// NOTE(review): short lines are absent from this view: the for-each loop bodies
// (embedded lines 307 and 321), embedded line 314 (presumably the else between the two
// assertions below) and embedded line 319 (presumably the last constructor argument)
// -- confirm against the full file.
296 public void testSingleHighSurrogateBWComapt() {
297 String missing = "Term %s is missing in the set";
298 String falsePos = "Term %s is in the set but shouldn't";
299 String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
300 "\uD800EfG", "\uD800\ud801\udc1cB" };
302 String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
303 "\uD800efg", "\uD800\ud801\udc44b" };
// Pass 1: third constructor argument true.
304 CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays
305 .asList(TEST_STOP_WORDS), true);
306 for (String upper : upperArr) {
309 for (int i = 0; i < upperArr.length; i++) {
310 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
// Last index: the term with the supplementary character.
311 if (i == lowerArr.length - 1)
312 assertFalse(String.format(falsePos, lowerArr[i]), set
313 .contains(lowerArr[i]));
315 assertTrue(String.format(missing, lowerArr[i]), set
316 .contains(lowerArr[i]));
// Pass 2: a fresh set; its last constructor argument is not visible in this view.
318 set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS),
320 for (String upper : upperArr) {
323 for (int i = 0; i < upperArr.length; i++) {
324 assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
325 assertFalse(String.format(falsePos, lowerArr[i]), set
326 .contains(lowerArr[i]));
330 @SuppressWarnings("deprecated")
331 public void testCopyCharArraySetBWCompat() {
332 CharArraySet setIngoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
333 CharArraySet setCaseSensitive = new CharArraySet(TEST_VERSION_CURRENT, 10, false);
335 List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
336 List<String> stopwordsUpper = new ArrayList<String>();
337 for (String string : stopwords) {
338 stopwordsUpper.add(string.toUpperCase());
340 setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
341 setIngoreCase.add(Integer.valueOf(1));
342 setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
343 setCaseSensitive.add(Integer.valueOf(1));
345 // This should use the deprecated methods, because it checks a bw compatibility.
346 CharArraySet copy = CharArraySet.copy(setIngoreCase);
347 CharArraySet copyCaseSens = CharArraySet.copy(setCaseSensitive);
349 assertEquals(setIngoreCase.size(), copy.size());
350 assertEquals(setCaseSensitive.size(), copy.size());
352 assertTrue(copy.containsAll(stopwords));
353 assertTrue(copy.containsAll(stopwordsUpper));
354 assertTrue(copyCaseSens.containsAll(stopwords));
355 for (String string : stopwordsUpper) {
356 assertFalse(copyCaseSens.contains(string));
358 // test adding terms to the copy
359 List<String> newWords = new ArrayList<String>();
360 for (String string : stopwords) {
361 newWords.add(string+"_1");
363 copy.addAll(newWords);
365 assertTrue(copy.containsAll(stopwords));
366 assertTrue(copy.containsAll(stopwordsUpper));
367 assertTrue(copy.containsAll(newWords));
368 // new added terms are not in the source set
369 for (String string : newWords) {
370 assertFalse(setIngoreCase.contains(string));
371 assertFalse(setCaseSensitive.contains(string));
377 * Test the static #copy() function with a CharArraySet as a source
379 public void testCopyCharArraySet() {
380 CharArraySet setIngoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);
381 CharArraySet setCaseSensitive = new CharArraySet(TEST_VERSION_CURRENT, 10, false);
383 List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
384 List<String> stopwordsUpper = new ArrayList<String>();
385 for (String string : stopwords) {
386 stopwordsUpper.add(string.toUpperCase());
388 setIngoreCase.addAll(Arrays.asList(TEST_STOP_WORDS));
389 setIngoreCase.add(Integer.valueOf(1));
390 setCaseSensitive.addAll(Arrays.asList(TEST_STOP_WORDS));
391 setCaseSensitive.add(Integer.valueOf(1));
393 CharArraySet copy = CharArraySet.copy(TEST_VERSION_CURRENT, setIngoreCase);
394 CharArraySet copyCaseSens = CharArraySet.copy(TEST_VERSION_CURRENT, setCaseSensitive);
396 assertEquals(setIngoreCase.size(), copy.size());
397 assertEquals(setCaseSensitive.size(), copy.size());
399 assertTrue(copy.containsAll(stopwords));
400 assertTrue(copy.containsAll(stopwordsUpper));
401 assertTrue(copyCaseSens.containsAll(stopwords));
402 for (String string : stopwordsUpper) {
403 assertFalse(copyCaseSens.contains(string));
405 // test adding terms to the copy
406 List<String> newWords = new ArrayList<String>();
407 for (String string : stopwords) {
408 newWords.add(string+"_1");
410 copy.addAll(newWords);
412 assertTrue(copy.containsAll(stopwords));
413 assertTrue(copy.containsAll(stopwordsUpper));
414 assertTrue(copy.containsAll(newWords));
415 // new added terms are not in the source set
416 for (String string : newWords) {
417 assertFalse(setIngoreCase.contains(string));
418 assertFalse(setCaseSensitive.contains(string));
424 * Test the static #copy() function with a JDK {@link Set} as a source
426 public void testCopyJDKSet() {
427 Set<String> set = new HashSet<String>();
429 List<String> stopwords = Arrays.asList(TEST_STOP_WORDS);
430 List<String> stopwordsUpper = new ArrayList<String>();
431 for (String string : stopwords) {
432 stopwordsUpper.add(string.toUpperCase());
434 set.addAll(Arrays.asList(TEST_STOP_WORDS));
436 CharArraySet copy = CharArraySet.copy(TEST_VERSION_CURRENT, set);
438 assertEquals(set.size(), copy.size());
439 assertEquals(set.size(), copy.size());
441 assertTrue(copy.containsAll(stopwords));
442 for (String string : stopwordsUpper) {
443 assertFalse(copy.contains(string));
446 List<String> newWords = new ArrayList<String>();
447 for (String string : stopwords) {
448 newWords.add(string+"_1");
450 copy.addAll(newWords);
452 assertTrue(copy.containsAll(stopwords));
453 assertTrue(copy.containsAll(newWords));
454 // new added terms are not in the source set
455 for (String string : newWords) {
456 assertFalse(set.contains(string));
461 * Tests a special case of {@link CharArraySet#copy(Version, Set)} where the
462 * set to copy is the {@link CharArraySet#EMPTY_SET}
464 public void testCopyEmptySet() {
465 assertSame(CharArraySet.EMPTY_SET,
466 CharArraySet.copy(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET));
470 * Smoketests the static empty set
// Smoke test for the shared CharArraySet.EMPTY_SET constant: size 0, isEmpty(), and no
// hit for any stop word or for "foo" through each contains(...) overload used here.
472 public void testEmptySet() {
473 assertEquals(0, CharArraySet.EMPTY_SET.size());
475 assertTrue(CharArraySet.EMPTY_SET.isEmpty());
476 for (String stopword : TEST_STOP_WORDS) {
477 assertFalse(CharArraySet.EMPTY_SET.contains(stopword));
// "foo" probed as String, as Object, as char[] and as (char[], offset, length).
479 assertFalse(CharArraySet.EMPTY_SET.contains("foo"));
480 assertFalse(CharArraySet.EMPTY_SET.contains((Object) "foo"));
481 assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray()));
482 assertFalse(CharArraySet.EMPTY_SET.contains("foo".toCharArray(),0,3));
488 public void testContainsWithNull() {
489 CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
491 set.contains((char[]) null, 0, 10);
492 fail("null value must raise NPE");
493 } catch (NullPointerException e) {}
495 set.contains((CharSequence) null);
496 fail("null value must raise NPE");
497 } catch (NullPointerException e) {}
499 set.contains((Object) null);
500 fail("null value must raise NPE");
501 } catch (NullPointerException e) {}
// Checks which iterator class a set hands out: for a copy made with Version.LUCENE_30,
// iterator() is expected to be a CharArraySet.CharArraySetIterator; for a copy made with
// TEST_VERSION_CURRENT it is expected not to be, while stringIterator() still is.
// Every element returned by stringIterator() must be found in the source HashSet.
504 @Deprecated @SuppressWarnings("unchecked")
505 public void testIterator() {
506 HashSet<String> hset = new HashSet<String>();
507 hset.addAll(Arrays.asList(TEST_STOP_WORDS));
// The raw (Iterator) casts let instanceof be applied independently of the declared
// element type of iterator().
509 assertTrue("in 3.0 version, iterator should be CharArraySetIterator",
510 ((Iterator) CharArraySet.copy(Version.LUCENE_30, hset).iterator()) instanceof CharArraySet.CharArraySetIterator);
512 CharArraySet set = CharArraySet.copy(TEST_VERSION_CURRENT, hset);
513 assertFalse("in current version, iterator should not be CharArraySetIterator",
514 ((Iterator) set.iterator()) instanceof CharArraySet.CharArraySetIterator);
516 Iterator<String> it = set.stringIterator();
517 assertTrue(it instanceof CharArraySet.CharArraySetIterator);
518 while (it.hasNext()) {
519 // as the set returns String instances, this must work:
520 assertTrue(hset.contains(it.next()));
// NOTE(review): embedded lines 521-522 are absent from this view; presumably a try
// opener and a remove() call on the iterator (the fail message below refers to
// remove()) -- confirm against the full file.
523 fail("remove() should not work on CharArraySetIterator");
524 } catch (UnsupportedOperationException uoe) {
// Checks toString(): a copy of the singleton {"test"} must print as "[test]", and a
// later check expects the output to contain ", ". The same two checks are made for a
// copy created with TEST_VERSION_CURRENT and for one created with Version.LUCENE_30.
530 public void testToString() {
531 CharArraySet set = CharArraySet.copy(TEST_VERSION_CURRENT, Collections.singleton("test"));
532 assertEquals("[test]", set.toString());
// NOTE(review): embedded line 533 is absent from this view; presumably it adds a second
// element so that the separator appears -- confirm against the full file.
534 assertTrue(set.toString().contains(", "));
536 set = CharArraySet.copy(Version.LUCENE_30, Collections.singleton("test"));
537 assertEquals("[test]", set.toString());
// NOTE(review): embedded line 538 is absent from this view as well; same assumption.
539 assertTrue(set.toString().contains(", "));