1 package org.apache.lucene.search.spell;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.List;
25 import java.util.concurrent.ExecutorService;
26 import java.util.concurrent.Executors;
27 import java.util.concurrent.TimeUnit;
29 import org.apache.lucene.analysis.MockAnalyzer;
30 import org.apache.lucene.document.Document;
31 import org.apache.lucene.document.Field;
32 import org.apache.lucene.index.CorruptIndexException;
33 import org.apache.lucene.index.IndexReader;
34 import org.apache.lucene.index.IndexWriter;
35 import org.apache.lucene.index.IndexWriterConfig;
36 import org.apache.lucene.search.IndexSearcher;
37 import org.apache.lucene.store.AlreadyClosedException;
38 import org.apache.lucene.store.Directory;
39 import org.apache.lucene.util.English;
40 import org.apache.lucene.util.LuceneTestCase;
43 * Spell checker test case
45 public class TestSpellChecker extends LuceneTestCase {
46 private SpellCheckerMock spellChecker;
47 private Directory userindex, spellindex;
48 private List<IndexSearcher> searchers;
// Test fixture: fills a fresh user index with 1000 documents (three stored, analyzed
// fields each), creates the list that records searchers, and builds a SpellCheckerMock
// over a second, fresh directory.
// NOTE(review): some lines of this method are not visible in this view - confirm
// against the full file before relying on this summary.
51 public void setUp() throws Exception {
55 userindex = newDirectory();
56 IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(
57 TEST_VERSION_CURRENT, new MockAnalyzer(random)));
59 for (int i = 0; i < 1000; i++) {
60 Document doc = new Document();
// field1: the number i spelled out in English
61 doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
62 doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // i + 1 reaches 1000, which presumably adds the word "thousand" to field2 only
63 doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // "fvei" in every document; "five" only in even-numbered ones
64 writer.addDocument(doc);
// SpellCheckerMock.createSearcher appends every searcher it creates to this list
67 searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
68 // create the spellChecker
69 spellindex = newDirectory();
70 spellChecker = new SpellCheckerMock(spellindex);
// Per-test cleanup.
// NOTE(review): only the guard below is visible in this view; the statement it controls
// and any further cleanup are not shown - confirm against the full file.
74 public void tearDown() throws Exception {
// act only if the spell checker is still open (testClose closes it itself)
76 if (!spellChecker.isClosed())
83 public void testBuild() throws CorruptIndexException, IOException {
84 IndexReader r = IndexReader.open(userindex, true);
86 spellChecker.clearIndex();
88 addwords(r, spellChecker, "field1");
89 int num_field1 = this.numdoc();
91 addwords(r, spellChecker, "field2");
92 int num_field2 = this.numdoc();
94 assertEquals(num_field2, num_field1 + 1);
96 assertLastSearcherOpen(4);
98 checkCommonSuggestions(r);
99 checkLevenshteinSuggestions(r);
101 spellChecker.setStringDistance(new JaroWinklerDistance());
102 spellChecker.setAccuracy(0.8f);
103 checkCommonSuggestions(r);
104 checkJaroWinklerSuggestions();
105 // the accuracy is set to 0.8 by default, but the best result has a score of 0.925
106 String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f);
107 assertTrue(similar.length == 0);
108 similar = spellChecker.suggestSimilar("fvie", 2, 0.92f);
109 assertTrue(similar.length == 1);
111 similar = spellChecker.suggestSimilar("fiv", 2);
112 assertTrue(similar.length > 0);
113 assertEquals(similar[0], "five");
115 spellChecker.setStringDistance(new NGramDistance(2));
116 spellChecker.setAccuracy(0.5f);
117 checkCommonSuggestions(r);
118 checkNGramSuggestions();
// Checks that suggestions with the same score are returned in the order imposed by the
// comparator handed to the spell checker (a SuggestWordFrequencyComparator here).
123 public void testComparator() throws Exception {
124 IndexReader r = IndexReader.open(userindex, true);
// separate spell index so the shared spellChecker fixture is not involved
125 Directory compIdx = newDirectory();
126 SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
127 addwords(r, compareSP, "field3");
129 String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
130 assertEquals(2, similar.length);
131 //five and fvei have the same score, but different frequencies.
// setUp writes "fvei" into every document and "five" into every second one
132 assertEquals("fvei", similar[0]);
133 assertEquals("five", similar[1]);
// NOTE(review): the statement guarded by this check and any closing of r / compIdx
// are not visible in this view - confirm against the full file.
135 if (!compareSP.isClosed())
// Checks that restricting suggestions to a field name that setUp never indexes
// yields an empty result.
140 public void testBogusField() throws Exception {
141 IndexReader r = IndexReader.open(userindex, true);
142 Directory compIdx = newDirectory();
143 SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
// the dictionary is still built from the existing field3
144 addwords(r, compareSP, "field3");
146 String[] similar = compareSP.suggestSimilar("fvie", 2, r, "bogusFieldBogusField", false);
147 assertEquals(0, similar.length);
// NOTE(review): the statement guarded by this check and any closing of r / compIdx
// are not visible in this view - confirm against the full file.
149 if (!compareSP.isClosed())
154 private void checkCommonSuggestions(IndexReader r) throws IOException {
155 String[] similar = spellChecker.suggestSimilar("fvie", 2);
156 assertTrue(similar.length > 0);
157 assertEquals(similar[0], "five");
159 similar = spellChecker.suggestSimilar("five", 2);
160 if (similar.length > 0) {
161 assertFalse(similar[0].equals("five")); // don't suggest a word for itself
164 similar = spellChecker.suggestSimilar("fiv", 2);
165 assertTrue(similar.length > 0);
166 assertEquals(similar[0], "five");
168 similar = spellChecker.suggestSimilar("fives", 2);
169 assertTrue(similar.length > 0);
170 assertEquals(similar[0], "five");
172 assertTrue(similar.length > 0);
173 similar = spellChecker.suggestSimilar("fie", 2);
174 assertEquals(similar[0], "five");
176 // test restraint to a field
177 similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
178 assertEquals(0, similar.length); // there isn't the term thousand in the field field1
180 similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
181 assertEquals(1, similar.length); // there is the term thousand in the field field2
184 private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
186 String[] similar = spellChecker.suggestSimilar("fvie", 2);
187 assertEquals(1, similar.length);
188 assertEquals(similar[0], "five");
190 similar = spellChecker.suggestSimilar("five", 2);
191 assertEquals(1, similar.length);
192 assertEquals(similar[0], "nine"); // don't suggest a word for itself
194 similar = spellChecker.suggestSimilar("fiv", 2);
195 assertEquals(1, similar.length);
196 assertEquals(similar[0], "five");
198 similar = spellChecker.suggestSimilar("ive", 2);
199 assertEquals(2, similar.length);
200 assertEquals(similar[0], "five");
201 assertEquals(similar[1], "nine");
203 similar = spellChecker.suggestSimilar("fives", 2);
204 assertEquals(1, similar.length);
205 assertEquals(similar[0], "five");
207 similar = spellChecker.suggestSimilar("fie", 2);
208 assertEquals(2, similar.length);
209 assertEquals(similar[0], "five");
210 assertEquals(similar[1], "nine");
212 similar = spellChecker.suggestSimilar("fi", 2);
213 assertEquals(1, similar.length);
214 assertEquals(similar[0], "five");
216 // test restraint to a field
217 similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
218 assertEquals(0, similar.length); // there isn't the term thousand in the field field1
220 similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
221 assertEquals(1, similar.length); // there is the term thousand in the field field2
223 similar = spellChecker.suggestSimilar("onety", 2);
224 assertEquals(2, similar.length);
225 assertEquals(similar[0], "ninety");
226 assertEquals(similar[1], "one");
228 similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
229 } catch (NullPointerException e) {
230 assertTrue("threw an NPE, and it shouldn't have", false);
234 private void checkJaroWinklerSuggestions() throws IOException {
235 String[] similar = spellChecker.suggestSimilar("onety", 2);
236 assertEquals(2, similar.length);
237 assertEquals(similar[0], "one");
238 assertEquals(similar[1], "ninety");
241 private void checkNGramSuggestions() throws IOException {
242 String[] similar = spellChecker.suggestSimilar("onety", 2);
243 assertEquals(2, similar.length);
244 assertEquals(similar[0], "one");
245 assertEquals(similar[1], "ninety");
248 private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
249 long time = System.currentTimeMillis();
250 sc.indexDictionary(new LuceneDictionary(r, field));
251 time = System.currentTimeMillis() - time;
252 //System.out.println("time to build " + field + ": " + time);
255 private int numdoc() throws IOException {
256 IndexReader rs = IndexReader.open(spellindex, true);
257 int num = rs.numDocs();
258 assertTrue(num != 0);
259 //System.out.println("num docs: " + num);
264 public void testClose() throws IOException {
265 IndexReader r = IndexReader.open(userindex, true);
266 spellChecker.clearIndex();
267 String field = "field1";
268 addwords(r, spellChecker, "field1");
269 int num_field1 = this.numdoc();
270 addwords(r, spellChecker, "field2");
271 int num_field2 = this.numdoc();
272 assertEquals(num_field2, num_field1 + 1);
273 checkCommonSuggestions(r);
274 assertLastSearcherOpen(4);
275 spellChecker.close();
276 assertSearchersClosed();
278 spellChecker.close();
279 fail("spellchecker was already closed");
280 } catch (AlreadyClosedException e) {
284 checkCommonSuggestions(r);
285 fail("spellchecker was already closed");
286 } catch (AlreadyClosedException e) {
291 spellChecker.clearIndex();
292 fail("spellchecker was already closed");
293 } catch (AlreadyClosedException e) {
298 spellChecker.indexDictionary(new LuceneDictionary(r, field));
299 fail("spellchecker was already closed");
300 } catch (AlreadyClosedException e) {
305 spellChecker.setSpellIndex(spellindex);
306 fail("spellchecker was already closed");
307 } catch (AlreadyClosedException e) {
310 assertEquals(4, searchers.size());
311 assertSearchersClosed();
316 * tests if the internally shared indexsearcher is correctly closed
317 * when the spellchecker is concurrently accessed and closed.
// Runs several SpellCheckWorker threads against the shared spellChecker while this
// thread repeatedly resets the spell index and finally closes the checker, then checks
// the workers' flags and that every recorded searcher ended up closed.
// NOTE(review): some lines of this method are not visible in this view (for example,
// no executor.shutdown() appears before awaitTermination) - confirm against the full file.
319 public void testConcurrentAccess() throws IOException, InterruptedException {
// one searcher has already been recorded before the test body runs
320 assertEquals(1, searchers.size());
321 final IndexReader r = IndexReader.open(userindex, true);
// clearIndex and each addwords call are each expected to record one more searcher
322 spellChecker.clearIndex();
323 assertEquals(2, searchers.size());
324 addwords(r, spellChecker, "field1");
325 assertEquals(3, searchers.size());
326 int num_field1 = this.numdoc();
327 addwords(r, spellChecker, "field2");
328 assertEquals(4, searchers.size());
329 int num_field2 = this.numdoc();
330 assertEquals(num_field2, num_field1 + 1);
// 5 + nextInt(5): presumably 5 to 9 workers, assuming random is a java.util.Random
331 int numThreads = 5 + this.random.nextInt(5);
332 ExecutorService executor = Executors.newFixedThreadPool(numThreads);
// keep the workers so their failed / terminated flags can be inspected afterwards
333 SpellCheckWorker[] workers = new SpellCheckWorker[numThreads];
334 for (int i = 0; i < numThreads; i++) {
335 SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r);
336 executor.execute(spellCheckWorker);
337 workers[i] = spellCheckWorker;
340 int iterations = 5 + random.nextInt(5);
341 for (int i = 0; i < iterations; i++) {
343 // concurrently reset the spell index
344 spellChecker.setSpellIndex(this.spellindex);
345 // for debug - prints the internal open searchers
346 // showSearchersOpen();
// close the checker after the workers have been submitted
349 spellChecker.close();
351 // wait for 60 seconds - usually this is very fast but coverage runs could take quite long
352 executor.awaitTermination(60L, TimeUnit.SECONDS);
354 for (int i = 0; i < workers.length; i++) {
355 assertFalse(String.format("worker thread %d failed", i), workers[i].failed);
356 assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated);
// the 4 extra searchers are the ones asserted above: one present at the start, one
// after clearIndex and one after each addwords call; each setSpellIndex adds one more
358 // 4 searchers more than iterations
361 // 2. and 3. during addwords
362 assertEquals(iterations + 4, searchers.size());
363 assertSearchersClosed();
367 private void assertLastSearcherOpen(int numSearchers) {
368 assertEquals(numSearchers, searchers.size());
369 IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]);
370 for (int i = 0; i < searcherArray.length; i++) {
371 if (i == searcherArray.length - 1) {
372 assertTrue("expected last searcher open but was closed",
373 searcherArray[i].getIndexReader().getRefCount() > 0);
375 assertFalse("expected closed searcher but was open - Index: " + i,
376 searcherArray[i].getIndexReader().getRefCount() > 0);
381 private void assertSearchersClosed() {
382 for (IndexSearcher searcher : searchers) {
383 assertEquals(0, searcher.getIndexReader().getRefCount());
388 // private void showSearchersOpen() {
390 // for (IndexSearcher searcher : searchers) {
391 // if(searcher.getIndexReader().getRefCount() > 0)
394 // System.out.println(count);
// Runnable used by testConcurrentAccess: it runs checkCommonSuggestions against the
// shared spell checker from an executor thread.
// NOTE(review): the run() header, the bodies of both catch clauses and the places where
// the two flags are set are not visible in this view - confirm against the full file.
398 private class SpellCheckWorker implements Runnable {
399 private final IndexReader reader;
// both flags are volatile; testConcurrentAccess reads them from the test thread
400 volatile boolean terminated = false;
401 volatile boolean failed = false;
403 SpellCheckWorker(IndexReader reader) {
405 this.reader = reader;
412 checkCommonSuggestions(reader);
// AlreadyClosedException is caught separately from any other Throwable
413 } catch (AlreadyClosedException e) {
416 } catch (Throwable e) {
// SpellChecker subclass used by the tests: it records every IndexSearcher obtained
// through createSearcher in the enclosing test's searchers list, so the tests can
// inspect how many searchers exist and whether their readers are still referenced.
// NOTE(review): parts of this class are not visible in this view - confirm against
// the full file.
430 class SpellCheckerMock extends SpellChecker {
431 public SpellCheckerMock(Directory spellIndex) throws IOException {
435 public SpellCheckerMock(Directory spellIndex, StringDistance sd)
437 super(spellIndex, sd);
440 public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
441 super(spellIndex, sd, comparator);
// delegate to the superclass, then remember the searcher it produced
445 IndexSearcher createSearcher(Directory dir) throws IOException {
446 IndexSearcher searcher = super.createSearcher(dir);
447 TestSpellChecker.this.searchers.add(searcher);