1 package org.apache.lucene.search.spell;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.List;
25 import java.util.concurrent.ExecutorService;
26 import java.util.concurrent.Executors;
27 import java.util.concurrent.TimeUnit;
29 import org.apache.lucene.analysis.MockAnalyzer;
30 import org.apache.lucene.document.Document;
31 import org.apache.lucene.document.Field;
32 import org.apache.lucene.index.CorruptIndexException;
33 import org.apache.lucene.index.IndexReader;
34 import org.apache.lucene.index.IndexWriter;
35 import org.apache.lucene.index.IndexWriterConfig;
36 import org.apache.lucene.search.IndexSearcher;
37 import org.apache.lucene.store.AlreadyClosedException;
38 import org.apache.lucene.store.Directory;
39 import org.apache.lucene.util.English;
40 import org.apache.lucene.util.LuceneTestCase;
43 * Spell checker test case
45 public class TestSpellChecker extends LuceneTestCase {
46 private SpellCheckerMock spellChecker;
47 private Directory userindex, spellindex;
48 private List<IndexSearcher> searchers;
51 public void setUp() throws Exception {
55 userindex = newDirectory();
56 IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(
57 TEST_VERSION_CURRENT, new MockAnalyzer(random)));
59 for (int i = 0; i < 1000; i++) {
60 Document doc = new Document();
61 doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
62 doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
63 doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
64 writer.addDocument(doc);
67 Document doc = new Document();
68 doc.add(newField("field1", "eight", Field.Index.ANALYZED)); // "eight" in
71 writer.addDocument(doc);
74 Document doc = new Document();
76 .add(newField("field1", "twenty-one twenty-one",
77 Field.Index.ANALYZED)); // "twenty-one" in the index thrice
78 writer.addDocument(doc);
81 Document doc = new Document();
82 doc.add(newField("field1", "twenty", Field.Index.ANALYZED)); // "twenty"
86 writer.addDocument(doc);
90 searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
91 // create the spellChecker
92 spellindex = newDirectory();
93 spellChecker = new SpellCheckerMock(spellindex);
97 public void tearDown() throws Exception {
99 if (!spellChecker.isClosed())
100 spellChecker.close();
106 public void testBuild() throws CorruptIndexException, IOException {
107 IndexReader r = IndexReader.open(userindex, true);
109 spellChecker.clearIndex();
111 addwords(r, spellChecker, "field1");
112 int num_field1 = this.numdoc();
114 addwords(r, spellChecker, "field2");
115 int num_field2 = this.numdoc();
117 assertEquals(num_field2, num_field1 + 1);
119 assertLastSearcherOpen(4);
121 checkCommonSuggestions(r);
122 checkLevenshteinSuggestions(r);
124 spellChecker.setStringDistance(new JaroWinklerDistance());
125 spellChecker.setAccuracy(0.8f);
126 checkCommonSuggestions(r);
127 checkJaroWinklerSuggestions();
128 // the accuracy is set to 0.8 by default, but the best result has a score of 0.925
129 String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f);
130 assertTrue(similar.length == 0);
131 similar = spellChecker.suggestSimilar("fvie", 2, 0.92f);
132 assertTrue(similar.length == 1);
134 similar = spellChecker.suggestSimilar("fiv", 2);
135 assertTrue(similar.length > 0);
136 assertEquals(similar[0], "five");
138 spellChecker.setStringDistance(new NGramDistance(2));
139 spellChecker.setAccuracy(0.5f);
140 checkCommonSuggestions(r);
141 checkNGramSuggestions();
146 public void testComparator() throws Exception {
147 IndexReader r = IndexReader.open(userindex, true);
148 Directory compIdx = newDirectory();
149 SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
150 addwords(r, compareSP, "field3");
152 String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
153 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
154 assertEquals(2, similar.length);
155 //five and fvei have the same score, but different frequencies.
156 assertEquals("fvei", similar[0]);
157 assertEquals("five", similar[1]);
159 if (!compareSP.isClosed())
164 public void testBogusField() throws Exception {
165 IndexReader r = IndexReader.open(userindex, true);
166 Directory compIdx = newDirectory();
167 SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
168 addwords(r, compareSP, "field3");
170 String[] similar = compareSP.suggestSimilar("fvie", 2, r,
171 "bogusFieldBogusField", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
172 assertEquals(0, similar.length);
174 if (!compareSP.isClosed())
179 public void testSuggestModes() throws Exception {
180 IndexReader r = IndexReader.open(userindex, true);
181 spellChecker.clearIndex();
182 addwords(r, spellChecker, "field1");
185 String[] similar = spellChecker.suggestSimilar("eighty", 2, r, "field1",
186 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
187 assertEquals(1, similar.length);
188 assertEquals("eighty", similar[0]);
192 String[] similar = spellChecker.suggestSimilar("eight", 2, r, "field1",
193 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
194 assertEquals(1, similar.length);
195 assertEquals("eight", similar[0]);
199 String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
200 SuggestMode.SUGGEST_MORE_POPULAR);
201 assertEquals(5, similar.length);
202 assertEquals("eight", similar[0]);
206 String[] similar = spellChecker.suggestSimilar("twenty", 5, r, "field1",
207 SuggestMode.SUGGEST_MORE_POPULAR);
208 assertEquals(1, similar.length);
209 assertEquals("twenty-one", similar[0]);
213 String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
214 SuggestMode.SUGGEST_MORE_POPULAR);
215 assertEquals(0, similar.length);
219 String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
220 SuggestMode.SUGGEST_ALWAYS);
221 assertEquals(5, similar.length);
222 assertEquals("eight", similar[0]);
226 String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
227 SuggestMode.SUGGEST_ALWAYS);
228 assertEquals(5, similar.length);
229 assertEquals("eighty", similar[0]);
233 private void checkCommonSuggestions(IndexReader r) throws IOException {
234 String[] similar = spellChecker.suggestSimilar("fvie", 2);
235 assertTrue(similar.length > 0);
236 assertEquals(similar[0], "five");
238 similar = spellChecker.suggestSimilar("five", 2);
239 if (similar.length > 0) {
240 assertFalse(similar[0].equals("five")); // don't suggest a word for itself
243 similar = spellChecker.suggestSimilar("fiv", 2);
244 assertTrue(similar.length > 0);
245 assertEquals(similar[0], "five");
247 similar = spellChecker.suggestSimilar("fives", 2);
248 assertTrue(similar.length > 0);
249 assertEquals(similar[0], "five");
251 assertTrue(similar.length > 0);
252 similar = spellChecker.suggestSimilar("fie", 2);
253 assertEquals(similar[0], "five");
255 // test restraint to a field
256 similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
257 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
258 assertEquals(0, similar.length); // there isn't the term thousand in the field field1
260 similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
261 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
262 assertEquals(1, similar.length); // there is the term thousand in the field field2
265 private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
267 String[] similar = spellChecker.suggestSimilar("fvie", 2);
268 assertEquals(1, similar.length);
269 assertEquals(similar[0], "five");
271 similar = spellChecker.suggestSimilar("five", 2);
272 assertEquals(1, similar.length);
273 assertEquals(similar[0], "nine"); // don't suggest a word for itself
275 similar = spellChecker.suggestSimilar("fiv", 2);
276 assertEquals(1, similar.length);
277 assertEquals(similar[0], "five");
279 similar = spellChecker.suggestSimilar("ive", 2);
280 assertEquals(2, similar.length);
281 assertEquals(similar[0], "five");
282 assertEquals(similar[1], "nine");
284 similar = spellChecker.suggestSimilar("fives", 2);
285 assertEquals(1, similar.length);
286 assertEquals(similar[0], "five");
288 similar = spellChecker.suggestSimilar("fie", 2);
289 assertEquals(2, similar.length);
290 assertEquals(similar[0], "five");
291 assertEquals(similar[1], "nine");
293 similar = spellChecker.suggestSimilar("fi", 2);
294 assertEquals(1, similar.length);
295 assertEquals(similar[0], "five");
297 // test restraint to a field
298 similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
299 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
300 assertEquals(0, similar.length); // there isn't the term thousand in the field field1
302 similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
303 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
304 assertEquals(1, similar.length); // there is the term thousand in the field field2
306 similar = spellChecker.suggestSimilar("onety", 2);
307 assertEquals(2, similar.length);
308 assertEquals(similar[0], "ninety");
309 assertEquals(similar[1], "one");
311 similar = spellChecker.suggestSimilar("tousand", 10, r, null,
312 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
313 } catch (NullPointerException e) {
314 assertTrue("threw an NPE, and it shouldn't have", false);
318 private void checkJaroWinklerSuggestions() throws IOException {
319 String[] similar = spellChecker.suggestSimilar("onety", 2);
320 assertEquals(2, similar.length);
321 assertEquals(similar[0], "one");
322 assertEquals(similar[1], "ninety");
325 private void checkNGramSuggestions() throws IOException {
326 String[] similar = spellChecker.suggestSimilar("onety", 2);
327 assertEquals(2, similar.length);
328 assertEquals(similar[0], "one");
329 assertEquals(similar[1], "ninety");
332 private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
333 long time = System.currentTimeMillis();
334 sc.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
335 time = System.currentTimeMillis() - time;
336 //System.out.println("time to build " + field + ": " + time);
339 private int numdoc() throws IOException {
340 IndexReader rs = IndexReader.open(spellindex, true);
341 int num = rs.numDocs();
342 assertTrue(num != 0);
343 //System.out.println("num docs: " + num);
348 public void testClose() throws IOException {
349 IndexReader r = IndexReader.open(userindex, true);
350 spellChecker.clearIndex();
351 String field = "field1";
352 addwords(r, spellChecker, "field1");
353 int num_field1 = this.numdoc();
354 addwords(r, spellChecker, "field2");
355 int num_field2 = this.numdoc();
356 assertEquals(num_field2, num_field1 + 1);
357 checkCommonSuggestions(r);
358 assertLastSearcherOpen(4);
359 spellChecker.close();
360 assertSearchersClosed();
362 spellChecker.close();
363 fail("spellchecker was already closed");
364 } catch (AlreadyClosedException e) {
368 checkCommonSuggestions(r);
369 fail("spellchecker was already closed");
370 } catch (AlreadyClosedException e) {
375 spellChecker.clearIndex();
376 fail("spellchecker was already closed");
377 } catch (AlreadyClosedException e) {
382 spellChecker.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
383 fail("spellchecker was already closed");
384 } catch (AlreadyClosedException e) {
389 spellChecker.setSpellIndex(spellindex);
390 fail("spellchecker was already closed");
391 } catch (AlreadyClosedException e) {
394 assertEquals(4, searchers.size());
395 assertSearchersClosed();
400 * tests if the internally shared indexsearcher is correctly closed
401 * when the spellchecker is concurrently accessed and closed.
403 public void testConcurrentAccess() throws IOException, InterruptedException {
404 assertEquals(1, searchers.size());
405 final IndexReader r = IndexReader.open(userindex, true);
406 spellChecker.clearIndex();
407 assertEquals(2, searchers.size());
408 addwords(r, spellChecker, "field1");
409 assertEquals(3, searchers.size());
410 int num_field1 = this.numdoc();
411 addwords(r, spellChecker, "field2");
412 assertEquals(4, searchers.size());
413 int num_field2 = this.numdoc();
414 assertEquals(num_field2, num_field1 + 1);
415 int numThreads = 5 + this.random.nextInt(5);
416 ExecutorService executor = Executors.newFixedThreadPool(numThreads);
417 SpellCheckWorker[] workers = new SpellCheckWorker[numThreads];
418 for (int i = 0; i < numThreads; i++) {
419 SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r);
420 executor.execute(spellCheckWorker);
421 workers[i] = spellCheckWorker;
424 int iterations = 5 + random.nextInt(5);
425 for (int i = 0; i < iterations; i++) {
427 // concurrently reset the spell index
428 spellChecker.setSpellIndex(this.spellindex);
429 // for debug - prints the internal open searchers
430 // showSearchersOpen();
433 spellChecker.close();
435 // wait for 60 seconds - usually this is very fast but coverage runs could take quite long
436 executor.awaitTermination(60L, TimeUnit.SECONDS);
438 for (int i = 0; i < workers.length; i++) {
439 assertFalse(String.format("worker thread %d failed", i), workers[i].failed);
440 assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated);
442 // 4 searchers more than iterations
445 // 2. and 3. during addwords
446 assertEquals(iterations + 4, searchers.size());
447 assertSearchersClosed();
451 private void assertLastSearcherOpen(int numSearchers) {
452 assertEquals(numSearchers, searchers.size());
453 IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]);
454 for (int i = 0; i < searcherArray.length; i++) {
455 if (i == searcherArray.length - 1) {
456 assertTrue("expected last searcher open but was closed",
457 searcherArray[i].getIndexReader().getRefCount() > 0);
459 assertFalse("expected closed searcher but was open - Index: " + i,
460 searcherArray[i].getIndexReader().getRefCount() > 0);
465 private void assertSearchersClosed() {
466 for (IndexSearcher searcher : searchers) {
467 assertEquals(0, searcher.getIndexReader().getRefCount());
472 // private void showSearchersOpen() {
474 // for (IndexSearcher searcher : searchers) {
475 // if(searcher.getIndexReader().getRefCount() > 0)
478 // System.out.println(count);
482 private class SpellCheckWorker implements Runnable {
483 private final IndexReader reader;
484 volatile boolean terminated = false;
485 volatile boolean failed = false;
487 SpellCheckWorker(IndexReader reader) {
489 this.reader = reader;
496 checkCommonSuggestions(reader);
497 } catch (AlreadyClosedException e) {
500 } catch (Throwable e) {
514 class SpellCheckerMock extends SpellChecker {
515 public SpellCheckerMock(Directory spellIndex) throws IOException {
519 public SpellCheckerMock(Directory spellIndex, StringDistance sd)
521 super(spellIndex, sd);
524 public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
525 super(spellIndex, sd, comparator);
529 IndexSearcher createSearcher(Directory dir) throws IOException {
530 IndexSearcher searcher = super.createSearcher(dir);
531 TestSpellChecker.this.searchers.add(searcher);