package org.apache.lucene.search.spell;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Spell checker test case.
 *
 * <p>Each test works against a "user" index of 1000 documents whose fields hold
 * the English spelling of numbers, and a separate spell index that the
 * {@link SpellChecker} under test is built into. Every {@link IndexSearcher} the
 * spell checker creates is recorded in {@link #searchers} so the tests can verify
 * that searchers are released when the spell index is swapped or closed.
 */
public class TestSpellChecker extends LuceneTestCase {
  private SpellCheckerMock spellChecker;
  private Directory userindex, spellindex;
  /** Every searcher created by a {@link SpellCheckerMock}, in creation order. */
  private List<IndexSearcher> searchers;

  /**
   * Builds the user index and creates an (empty) spell checker on a fresh
   * spell index directory.
   */
  @Override
  public void setUp() throws Exception {
    super.setUp();

    // create a user index
    userindex = newDirectory();
    IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));

    for (int i = 0; i < 1000; i++) {
      Document doc = new Document();
      doc.add(newField("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
      // field2 is shifted by one, so it additionally contains the word "thousand"
      doc.add(newField("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED));
      // "fvei" is in every document, "five" only in every second one
      doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED));
      writer.addDocument(doc);
    }
    // extra occurrences used by testSuggestModes to make some terms more frequent
    addField1Document(writer, "eight");
    addField1Document(writer, "twenty-one twenty-one");
    addField1Document(writer, "twenty");

    writer.close();

    // the list must exist before the spell checker is constructed, because the
    // constructor already goes through SpellCheckerMock.createSearcher
    searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
    // create the spellChecker
    spellindex = newDirectory();
    spellChecker = new SpellCheckerMock(spellindex);
  }

  /** Adds a document to {@code writer} whose only field is {@code field1} with the given text. */
  private void addField1Document(IndexWriter writer, String text) throws IOException {
    Document doc = new Document();
    doc.add(newField("field1", text, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

  /** Closes the user index, the spell checker (unless a test already closed it) and the spell index. */
  @Override
  public void tearDown() throws Exception {
    userindex.close();
    if (!spellChecker.isClosed()) {
      spellChecker.close();
    }
    spellindex.close();
    super.tearDown();
  }

  /**
   * Builds the spell index from field1 and field2 and checks the suggestions
   * produced with the default, Jaro-Winkler and n-gram string distances.
   */
  public void testBuild() throws CorruptIndexException, IOException {
    IndexReader r = IndexReader.open(userindex, true);
    try {
      spellChecker.clearIndex();

      addwords(r, spellChecker, "field1");
      int num_field1 = this.numdoc();

      addwords(r, spellChecker, "field2");
      int num_field2 = this.numdoc();

      // field2 contributes exactly one word that field1 does not have
      assertEquals(num_field1 + 1, num_field2);

      assertLastSearcherOpen(4);

      checkCommonSuggestions(r);
      checkLevenshteinSuggestions(r);

      spellChecker.setStringDistance(new JaroWinklerDistance());
      spellChecker.setAccuracy(0.8f);
      checkCommonSuggestions(r);
      checkJaroWinklerSuggestions();
      // the best result for "fvie" has a score of 0.925: an explicit accuracy
      // of 0.93 filters it out, 0.92 keeps it
      String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f);
      assertEquals(0, similar.length);
      similar = spellChecker.suggestSimilar("fvie", 2, 0.92f);
      assertEquals(1, similar.length);

      similar = spellChecker.suggestSimilar("fiv", 2);
      assertTrue(similar.length > 0);
      assertEquals("five", similar[0]);

      spellChecker.setStringDistance(new NGramDistance(2));
      spellChecker.setAccuracy(0.5f);
      checkCommonSuggestions(r);
      checkNGramSuggestions();
    } finally {
      r.close();
    }
  }

  /**
   * Checks that {@link SuggestWordFrequencyComparator} orders suggestions with
   * the same score by their frequency in the user index.
   */
  public void testComparator() throws Exception {
    IndexReader r = IndexReader.open(userindex, true);
    Directory compIdx = newDirectory();
    SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
    try {
      addwords(r, compareSP, "field3");

      String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
          SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
      assertEquals(2, similar.length);
      // five and fvei have the same score, but different frequencies.
      assertEquals("fvei", similar[0]);
      assertEquals("five", similar[1]);
    } finally {
      r.close();
      if (!compareSP.isClosed()) {
        compareSP.close();
      }
      compIdx.close();
    }
  }

  /** Checks that restricting suggestions to a field that does not exist yields no suggestions. */
  public void testBogusField() throws Exception {
    IndexReader r = IndexReader.open(userindex, true);
    Directory compIdx = newDirectory();
    SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
    try {
      addwords(r, compareSP, "field3");

      String[] similar = compareSP.suggestSimilar("fvie", 2, r,
          "bogusFieldBogusField", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
      assertEquals(0, similar.length);
    } finally {
      r.close();
      if (!compareSP.isClosed()) {
        compareSP.close();
      }
      compIdx.close();
    }
  }

  /** Checks the results of each {@link SuggestMode} for words that are, and are not, in field1. */
  public void testSuggestModes() throws Exception {
    IndexReader r = IndexReader.open(userindex, true);
    try {
      spellChecker.clearIndex();
      addwords(r, spellChecker, "field1");

      {
        String[] similar = spellChecker.suggestSimilar("eighty", 2, r, "field1",
            SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
        assertEquals(1, similar.length);
        assertEquals("eighty", similar[0]);
      }

      {
        String[] similar = spellChecker.suggestSimilar("eight", 2, r, "field1",
            SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
        assertEquals(1, similar.length);
        assertEquals("eight", similar[0]);
      }

      {
        String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
            SuggestMode.SUGGEST_MORE_POPULAR);
        assertEquals(5, similar.length);
        assertEquals("eight", similar[0]);
      }

      {
        String[] similar = spellChecker.suggestSimilar("twenty", 5, r, "field1",
            SuggestMode.SUGGEST_MORE_POPULAR);
        assertEquals(1, similar.length);
        assertEquals("twenty-one", similar[0]);
      }

      {
        String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
            SuggestMode.SUGGEST_MORE_POPULAR);
        assertEquals(0, similar.length);
      }

      {
        String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
            SuggestMode.SUGGEST_ALWAYS);
        assertEquals(5, similar.length);
        assertEquals("eight", similar[0]);
      }

      {
        String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
            SuggestMode.SUGGEST_ALWAYS);
        assertEquals(5, similar.length);
        assertEquals("eighty", similar[0]);
      }
    } finally {
      r.close();
    }
  }

  /**
   * Suggestion checks that are expected to hold for every string distance
   * the tests configure.
   *
   * @param r reader on the user index, used for the field-restricted lookups
   */
  private void checkCommonSuggestions(IndexReader r) throws IOException {
    String[] similar = spellChecker.suggestSimilar("fvie", 2);
    assertTrue(similar.length > 0);
    assertEquals("five", similar[0]);

    similar = spellChecker.suggestSimilar("five", 2);
    if (similar.length > 0) {
      assertFalse(similar[0].equals("five")); // don't suggest a word for itself
    }

    similar = spellChecker.suggestSimilar("fiv", 2);
    assertTrue(similar.length > 0);
    assertEquals("five", similar[0]);

    similar = spellChecker.suggestSimilar("fives", 2);
    assertTrue(similar.length > 0);
    assertEquals("five", similar[0]);

    // the length check has to follow the lookup it is meant to guard
    similar = spellChecker.suggestSimilar("fie", 2);
    assertTrue(similar.length > 0);
    assertEquals("five", similar[0]);

    // test restraint to a field
    similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
    assertEquals(0, similar.length); // there isn't the term thousand in the field field1

    similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
    assertEquals(1, similar.length); // there is the term thousand in the field field2
  }

  /**
   * Exact suggestion lists expected while the spell checker still uses the
   * string distance it was constructed with.
   *
   * @param r reader on the user index, used for the field-restricted lookups
   */
  private void checkLevenshteinSuggestions(IndexReader r) throws IOException {
    // test small word
    String[] similar = spellChecker.suggestSimilar("fvie", 2);
    assertEquals(1, similar.length);
    assertEquals("five", similar[0]);

    similar = spellChecker.suggestSimilar("five", 2);
    assertEquals(1, similar.length);
    assertEquals("nine", similar[0]); // don't suggest a word for itself

    similar = spellChecker.suggestSimilar("fiv", 2);
    assertEquals(1, similar.length);
    assertEquals("five", similar[0]);

    similar = spellChecker.suggestSimilar("ive", 2);
    assertEquals(2, similar.length);
    assertEquals("five", similar[0]);
    assertEquals("nine", similar[1]);

    similar = spellChecker.suggestSimilar("fives", 2);
    assertEquals(1, similar.length);
    assertEquals("five", similar[0]);

    similar = spellChecker.suggestSimilar("fie", 2);
    assertEquals(2, similar.length);
    assertEquals("five", similar[0]);
    assertEquals("nine", similar[1]);

    similar = spellChecker.suggestSimilar("fi", 2);
    assertEquals(1, similar.length);
    assertEquals("five", similar[0]);

    // test restraint to a field
    similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
    assertEquals(0, similar.length); // there isn't the term thousand in the field field1

    similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
        SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
    assertEquals(1, similar.length); // there is the term thousand in the field field2

    similar = spellChecker.suggestSimilar("onety", 2);
    assertEquals(2, similar.length);
    assertEquals("ninety", similar[0]);
    assertEquals("one", similar[1]);

    // a null field together with a reader must not trigger an NPE
    try {
      similar = spellChecker.suggestSimilar("tousand", 10, r, null,
          SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
    } catch (NullPointerException e) {
      fail("threw an NPE, and it shouldn't have");
    }
  }

  /** Suggestion order expected for "onety" after switching to {@link JaroWinklerDistance}. */
  private void checkJaroWinklerSuggestions() throws IOException {
    String[] similar = spellChecker.suggestSimilar("onety", 2);
    assertEquals(2, similar.length);
    assertEquals("one", similar[0]);
    assertEquals("ninety", similar[1]);
  }

  /** Suggestion order expected for "onety" after switching to {@link NGramDistance}. */
  private void checkNGramSuggestions() throws IOException {
    String[] similar = spellChecker.suggestSimilar("onety", 2);
    assertEquals(2, similar.length);
    assertEquals("one", similar[0]);
    assertEquals("ninety", similar[1]);
  }

  /**
   * Indexes all terms of {@code field} from the user index into {@code sc}.
   *
   * @param r     reader on the user index
   * @param sc    spell checker to add the words to
   * @param field name of the user-index field whose terms are used as dictionary
   */
  private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
    sc.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
  }

  /**
   * Returns the number of documents currently in the spell index and asserts
   * that it is not empty.
   */
  private int numdoc() throws IOException {
    IndexReader rs = IndexReader.open(spellindex, true);
    try {
      int num = rs.numDocs();
      assertTrue(num != 0);
      return num;
    } finally {
      rs.close();
    }
  }

  /**
   * Checks that closing the spell checker releases all searchers and that every
   * subsequent operation fails with {@link AlreadyClosedException}.
   */
  public void testClose() throws IOException {
    IndexReader r = IndexReader.open(userindex, true);
    try {
      spellChecker.clearIndex();
      String field = "field1";
      addwords(r, spellChecker, field);
      int num_field1 = this.numdoc();
      addwords(r, spellChecker, "field2");
      int num_field2 = this.numdoc();
      assertEquals(num_field1 + 1, num_field2);
      checkCommonSuggestions(r);
      assertLastSearcherOpen(4);
      spellChecker.close();
      assertSearchersClosed();
      try {
        spellChecker.close();
        fail("spellchecker was already closed");
      } catch (AlreadyClosedException e) {
        // expected
      }
      try {
        checkCommonSuggestions(r);
        fail("spellchecker was already closed");
      } catch (AlreadyClosedException e) {
        // expected
      }

      try {
        spellChecker.clearIndex();
        fail("spellchecker was already closed");
      } catch (AlreadyClosedException e) {
        // expected
      }

      try {
        spellChecker.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
        fail("spellchecker was already closed");
      } catch (AlreadyClosedException e) {
        // expected
      }

      try {
        spellChecker.setSpellIndex(spellindex);
        fail("spellchecker was already closed");
      } catch (AlreadyClosedException e) {
        // expected
      }
      // none of the rejected calls may have created another searcher
      assertEquals(4, searchers.size());
      assertSearchersClosed();
    } finally {
      r.close();
    }
  }

  /*
   * tests if the internally shared indexsearcher is correctly closed
   * when the spellchecker is concurrently accessed and closed.
   */
  public void testConcurrentAccess() throws IOException, InterruptedException {
    assertEquals(1, searchers.size());
    final IndexReader r = IndexReader.open(userindex, true);
    try {
      spellChecker.clearIndex();
      assertEquals(2, searchers.size());
      addwords(r, spellChecker, "field1");
      assertEquals(3, searchers.size());
      int num_field1 = this.numdoc();
      addwords(r, spellChecker, "field2");
      assertEquals(4, searchers.size());
      int num_field2 = this.numdoc();
      assertEquals(num_field1 + 1, num_field2);

      int numThreads = 5 + random.nextInt(5);
      ExecutorService executor = Executors.newFixedThreadPool(numThreads);
      SpellCheckWorker[] workers = new SpellCheckWorker[numThreads];
      int iterations;
      try {
        for (int i = 0; i < numThreads; i++) {
          SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r);
          executor.execute(spellCheckWorker);
          workers[i] = spellCheckWorker;
        }
        iterations = 5 + random.nextInt(5);
        for (int i = 0; i < iterations; i++) {
          Thread.sleep(100);
          // concurrently reset the spell index
          spellChecker.setSpellIndex(this.spellindex);
          // for debug - prints the internal open searchers
          // showSearchersOpen();
        }

        // workers leave their loop once they see AlreadyClosedException
        spellChecker.close();
      } finally {
        // stop accepting tasks even if the test fails part-way through
        executor.shutdown();
      }
      // wait for 60 seconds - usually this is very fast but coverage runs could take quite long
      executor.awaitTermination(60L, TimeUnit.SECONDS);

      for (int i = 0; i < workers.length; i++) {
        assertFalse(String.format("worker thread %d failed", i), workers[i].failed);
        assertTrue(String.format("worker thread %d is still running but should be terminated", i), workers[i].terminated);
      }
      // 4 searchers more than iterations
      // 1. at creation
      // 2. clearIndex()
      // 3. and 4. during addwords
      assertEquals(iterations + 4, searchers.size());
      assertSearchersClosed();
    } finally {
      r.close();
    }
  }

  /**
   * Asserts that exactly {@code numSearchers} searchers have been created and
   * that only the most recently created one still holds a reference to its reader.
   */
  private void assertLastSearcherOpen(int numSearchers) {
    assertEquals(numSearchers, searchers.size());
    IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]);
    int last = searcherArray.length - 1;
    for (int i = 0; i < searcherArray.length; i++) {
      boolean open = searcherArray[i].getIndexReader().getRefCount() > 0;
      if (i == last) {
        assertTrue("expected last searcher open but was closed", open);
      } else {
        assertFalse("expected closed searcher but was open - Index: " + i, open);
      }
    }
  }

  /** Asserts that the reader of every recorded searcher has a reference count of zero. */
  private void assertSearchersClosed() {
    // iterate over a snapshot: a synchronized list is not safe to iterate directly
    for (IndexSearcher searcher : searchers.toArray(new IndexSearcher[0])) {
      assertEquals(0, searcher.getIndexReader().getRefCount());
    }
  }

  // For debug
//  private void showSearchersOpen() {
//    int count = 0;
//    for (IndexSearcher searcher : searchers) {
//      if(searcher.getIndexReader().getRefCount() > 0)
//        ++count;
//    }
//    System.out.println(count);
//  }

  /**
   * Runs {@link #checkCommonSuggestions(IndexReader)} in a loop until the spell
   * checker reports that it has been closed, or until any other error occurs.
   */
  private class SpellCheckWorker implements Runnable {
    private final IndexReader reader;
    /** Set once {@link #run()} has left its loop, for whatever reason. */
    volatile boolean terminated = false;
    /** Set when the loop ended because of anything other than {@link AlreadyClosedException}. */
    volatile boolean failed = false;

    SpellCheckWorker(IndexReader reader) {
      this.reader = reader;
    }

    public void run() {
      try {
        while (true) {
          try {
            checkCommonSuggestions(reader);
          } catch (AlreadyClosedException e) {
            // the regular way out: the spell checker was closed by the test
            return;
          } catch (Throwable e) {
            e.printStackTrace();
            failed = true;
            return;
          }
        }
      } finally {
        terminated = true;
      }
    }

  }

  /**
   * {@link SpellChecker} that records every searcher it creates in the
   * enclosing test's {@link #searchers} list.
   */
  class SpellCheckerMock extends SpellChecker {
    public SpellCheckerMock(Directory spellIndex) throws IOException {
      super(spellIndex);
    }

    public SpellCheckerMock(Directory spellIndex, StringDistance sd)
        throws IOException {
      super(spellIndex, sd);
    }

    public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
      super(spellIndex, sd, comparator);
    }

    @Override
    IndexSearcher createSearcher(Directory dir) throws IOException {
      IndexSearcher searcher = super.createSearcher(dir);
      TestSpellChecker.this.searchers.add(searcher);
      return searcher;
    }
  }

}