1 package org.apache.lucene.search.spell;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.Iterator;
24 import java.util.List;
26 import org.apache.lucene.analysis.WhitespaceAnalyzer;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Field;
29 import org.apache.lucene.index.FieldInfo.IndexOptions;
30 import org.apache.lucene.index.IndexReader;
31 import org.apache.lucene.index.IndexWriter;
32 import org.apache.lucene.index.IndexWriterConfig;
33 import org.apache.lucene.index.TieredMergePolicy;
34 import org.apache.lucene.index.Term;
35 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
36 import org.apache.lucene.search.BooleanClause;
37 import org.apache.lucene.search.BooleanQuery;
38 import org.apache.lucene.search.IndexSearcher;
39 import org.apache.lucene.search.Query;
40 import org.apache.lucene.search.ScoreDoc;
41 import org.apache.lucene.search.TermQuery;
42 import org.apache.lucene.store.AlreadyClosedException;
43 import org.apache.lucene.store.Directory;
44 import org.apache.lucene.util.ReaderUtil;
45 import org.apache.lucene.util.Version;
49 * Spell Checker class (Main class) <br/>
50 * (initially inspired by the David Spencer code).
56 * SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
57 * // To index a field of a user index:
58 * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
59 * // To index a file containing words:
60 * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
61 * String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
67 public class SpellChecker implements java.io.Closeable {
70 * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} .
72 public static final float DEFAULT_ACCURACY = 0.5f;
75 * Field name for each word in the ngram index.
77 public static final String F_WORD = "word";
79 private static final Term F_WORD_TERM = new Term(F_WORD);
84 // don't modify the directory directly - see #swapSearcher()
85 // TODO: why is this package private?
88 * Boost value for start and end grams
90 private float bStart = 2.0f;
92 private float bEnd = 1.0f;
93 // don't use this searcher directly - see #swapSearcher()
95 private IndexSearcher searcher;
97 * this locks all modifications to the current searcher.
100 private final Object searcherLock = new Object();
102 * this lock synchronizes all possible modifications to the
103 * current index directory. It should not be possible to try modifying
104 * the same index concurrently. Note: Do not acquire the searcher lock
105 * before acquiring this lock!
107 private final Object modifyCurrentIndexLock = new Object();
109 private volatile boolean closed = false;
110 // minimum score for hits generated by the spell checker query
112 private float accuracy = DEFAULT_ACCURACY;
114 private StringDistance sd;
115 private Comparator<SuggestWord> comparator;
118 * Use the given directory as a spell checker index. The directory
119 * is created if it doesn't exist yet.
120 * @param spellIndex the spell index directory
121 * @param sd the {@link StringDistance} measurement to use
122 * @throws IOException if Spellchecker can not open the directory
124 public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
125 this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
128 * Use the given directory as a spell checker index with a
129 * {@link LevensteinDistance} as the default {@link StringDistance}. The
130 * directory is created if it doesn't exist yet.
133 * the spell index directory
134 * @throws IOException
135 * if spellchecker can not open the directory
137 public SpellChecker(Directory spellIndex) throws IOException {
138 this(spellIndex, new LevensteinDistance());
142 * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
143 * and the given {@link java.util.Comparator} for sorting the results.
144 * @param spellIndex The spelling index
145 * @param sd The distance
146 * @param comparator The comparator
147 * @throws IOException if there is a problem opening the index
149 public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
150 setSpellIndex(spellIndex);
151 setStringDistance(sd);
152 this.comparator = comparator;
156 * Use a different index as the spell checker index or re-open
157 * the existing index if <code>spellIndex</code> is the same value
158 * as given in the constructor.
159 * @param spellIndexDir the spell directory to use
160 * @throws AlreadyClosedException if the Spellchecker is already closed
161 * @throws IOException if spellchecker can not open the directory
163 // TODO: we should make this final as it is called in the constructor
164 public void setSpellIndex(Directory spellIndexDir) throws IOException {
165 // this could be the same directory as the current spellIndex
166 // modifications to the directory should be synchronized
167 synchronized (modifyCurrentIndexLock) {
169 if (!IndexReader.indexExists(spellIndexDir)) {
170 IndexWriter writer = new IndexWriter(spellIndexDir,
171 new IndexWriterConfig(Version.LUCENE_CURRENT,
172 new WhitespaceAnalyzer(Version.LUCENE_CURRENT)));
175 swapSearcher(spellIndexDir);
180 * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
181 * @param comparator the comparator
183 public void setComparator(Comparator<SuggestWord> comparator) {
184 this.comparator = comparator;
187 public Comparator<SuggestWord> getComparator() {
192 * Sets the {@link StringDistance} implementation for this
193 * {@link SpellChecker} instance.
195 * @param sd the {@link StringDistance} implementation for this
196 * {@link SpellChecker} instance
198 public void setStringDistance(StringDistance sd) {
202 * Returns the {@link StringDistance} instance used by this
203 * {@link SpellChecker} instance.
205 * @return the {@link StringDistance} instance used by this
206 * {@link SpellChecker} instance.
208 public StringDistance getStringDistance() {
213 * Sets the accuracy 0 < minScore < 1; default {@link #DEFAULT_ACCURACY}
214 * @param acc The new accuracy
216 public void setAccuracy(float acc) {
221 * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to
222 * decide whether a suggestion is included or not.
223 * @return The current accuracy setting
225 public float getAccuracy() {
230 * Suggest similar words.
232 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
233 * is not the same as the edit distance strategy used to calculate the best
234 * matching spell-checked word from the hits that Lucene found, one usually has
235 * to retrieve a couple of numSug's in order to get the true best match.
237 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
238 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
240 * @param word the word you want a spell check done on
241 * @param numSug the number of suggested words
242 * @throws IOException if the underlying index throws an {@link IOException}
243 * @throws AlreadyClosedException if the Spellchecker is already closed
246 * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
248 public String[] suggestSimilar(String word, int numSug) throws IOException {
249 return this.suggestSimilar(word, numSug, null, null, false);
253 * Suggest similar words.
255 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
256 * is not the same as the edit distance strategy used to calculate the best
257 * matching spell-checked word from the hits that Lucene found, one usually has
258 * to retrieve a couple of numSug's in order to get the true best match.
260 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
261 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
263 * @param word the word you want a spell check done on
264 * @param numSug the number of suggested words
265 * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
266 * @throws IOException if the underlying index throws an {@link IOException}
267 * @throws AlreadyClosedException if the Spellchecker is already closed
270 * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
272 public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
273 return this.suggestSimilar(word, numSug, null, null, false, accuracy);
277 * Suggest similar words (optionally restricted to a field of an index).
279 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
280 * is not the same as the edit distance strategy used to calculate the best
281 * matching spell-checked word from the hits that Lucene found, one usually has
282 * to retrieve a couple of numSug's in order to get the true best match.
284 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
285 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
287 * <p>Uses the {@link #getAccuracy()} value passed into the constructor as the accuracy.
289 * @param word the word you want a spell check done on
290 * @param numSug the number of suggested words
291 * @param ir the indexReader of the user index (can be null see field param)
292 * @param field the field of the user index: if field is not null, the suggested
293 * words are restricted to the words present in this field.
294 * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
295 * (only if restricted mode = (indexReader!=null and field!=null)
296 * @throws IOException if the underlying index throws an {@link IOException}
297 * @throws AlreadyClosedException if the Spellchecker is already closed
298 * @return String[] the sorted list of the suggest words with these 2 criteria:
299 * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
300 * of the suggest words in the field of the user index
302 * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
304 public String[] suggestSimilar(String word, int numSug, IndexReader ir,
305 String field, boolean morePopular) throws IOException {
306 return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
311 * Suggest similar words (optionally restricted to a field of an index).
313 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
314 * is not the same as the edit distance strategy used to calculate the best
315 * matching spell-checked word from the hits that Lucene found, one usually has
316 * to retrieve a couple of numSug's in order to get the true best match.
318 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
319 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
321 * @param word the word you want a spell check done on
322 * @param numSug the number of suggested words
323 * @param ir the indexReader of the user index (can be null see field param)
324 * @param field the field of the user index: if field is not null, the suggested
325 * words are restricted to the words present in this field.
326 * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
327 * (only if restricted mode = (indexReader!=null and field!=null)
328 * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
329 * @throws IOException if the underlying index throws an {@link IOException}
330 * @throws AlreadyClosedException if the Spellchecker is already closed
331 * @return String[] the sorted list of the suggest words with these 2 criteria:
332 * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
333 * of the suggest words in the field of the user index
335 public String[] suggestSimilar(String word, int numSug, IndexReader ir,
336 String field, boolean morePopular, float accuracy) throws IOException {
337 // obtainSearcher calls ensureOpen
338 final IndexSearcher indexSearcher = obtainSearcher();
341 final int lengthWord = word.length();
343 final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
344 final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
345 // if the word exists in the real index and we don't care for word frequency, return the word itself
346 if (!morePopular && freq > 0) {
347 return new String[] { word };
350 BooleanQuery query = new BooleanQuery();
354 for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
356 key = "gram" + ng; // form key
358 grams = formGrams(word, ng); // form word into ngrams (allow dups too)
360 if (grams.length == 0) {
364 if (bStart > 0) { // should we boost prefixes?
365 add(query, "start" + ng, grams[0], bStart); // matches start of word
368 if (bEnd > 0) { // should we boost suffixes
369 add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
372 for (int i = 0; i < grams.length; i++) {
373 add(query, key, grams[i]);
377 int maxHits = 10 * numSug;
379 // System.out.println("Q: " + query);
380 ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
381 // System.out.println("HITS: " + hits.length());
382 SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
384 // go thru more than 'maxr' matches in case the distance filter triggers
385 int stop = Math.min(hits.length, maxHits);
386 SuggestWord sugWord = new SuggestWord();
387 for (int i = 0; i < stop; i++) {
389 sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word
391 // don't suggest a word for itself, that would be silly
392 if (sugWord.string.equals(word)) {
397 sugWord.score = sd.getDistance(word,sugWord.string);
398 if (sugWord.score < accuracy) {
402 if (ir != null && field != null) { // use the user index
403 sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
404 // don't suggest a word that is not present in the field
405 if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
409 sugQueue.insertWithOverflow(sugWord);
410 if (sugQueue.size() == numSug) {
411 // if queue full, maintain the minScore score
412 accuracy = sugQueue.top().score;
414 sugWord = new SuggestWord();
417 // convert to array string
418 String[] list = new String[sugQueue.size()];
419 for (int i = sugQueue.size() - 1; i >= 0; i--) {
420 list[i] = sugQueue.pop().string;
425 releaseSearcher(indexSearcher);
429 * Add a clause to a boolean query.
431 private static void add(BooleanQuery q, String name, String value, float boost) {
432 Query tq = new TermQuery(new Term(name, value));
434 q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
438 * Add a clause to a boolean query.
440 private static void add(BooleanQuery q, String name, String value) {
441 q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
445 * Form all ngrams for a given word.
446 * @param text the word to parse
447 * @param ng the ngram length e.g. 3
448 * @return an array of all ngrams in the word and note that duplicates are not removed
450 private static String[] formGrams(String text, int ng) {
451 int len = text.length();
452 String[] res = new String[len - ng + 1];
453 for (int i = 0; i < len - ng + 1; i++) {
454 res[i] = text.substring(i, i + ng);
460 * Removes all terms from the spell check index.
461 * @throws IOException
462 * @throws AlreadyClosedException if the Spellchecker is already closed
464 public void clearIndex() throws IOException {
465 synchronized (modifyCurrentIndexLock) {
467 final Directory dir = this.spellIndex;
468 final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
469 Version.LUCENE_CURRENT,
470 new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
471 .setOpenMode(OpenMode.CREATE));
478 * Check whether the word exists in the index.
480 * @throws IOException
481 * @throws AlreadyClosedException if the Spellchecker is already closed
482 * @return true if the word exists in the index
484 public boolean exist(String word) throws IOException {
485 // obtainSearcher calls ensureOpen
486 final IndexSearcher indexSearcher = obtainSearcher();
488 return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0;
490 releaseSearcher(indexSearcher);
495 * Indexes the data from the given {@link Dictionary}.
496 * @param dict Dictionary to index
497 * @param mergeFactor mergeFactor to use when indexing
498 * @param ramMB the max amount or memory in MB to use
499 * @param optimize whether or not the spellcheck index should be optimized
500 * @throws AlreadyClosedException if the Spellchecker is already closed
501 * @throws IOException
503 public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
504 synchronized (modifyCurrentIndexLock) {
506 final Directory dir = this.spellIndex;
507 final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
508 ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setMaxMergeAtOnce(mergeFactor);
509 IndexSearcher indexSearcher = obtainSearcher();
510 final List<IndexReader> readers = new ArrayList<IndexReader>();
512 if (searcher.maxDoc() > 0) {
513 ReaderUtil.gatherSubReaders(readers, searcher.getIndexReader());
516 boolean isEmpty = readers.isEmpty();
519 Iterator<String> iter = dict.getWordsIterator();
521 terms: while (iter.hasNext()) {
522 String word = iter.next();
524 int len = word.length();
526 continue; // too short we bail but "too long" is fine...
530 // we have a non-empty index, check if the term exists
531 Term term = F_WORD_TERM.createTerm(word);
532 for (IndexReader ir : readers) {
533 if (ir.docFreq(term) > 0) {
540 Document doc = createDocument(word, getMin(len), getMax(len));
541 writer.addDocument(doc);
544 releaseSearcher(indexSearcher);
550 // also re-open the spell index to see our own changes when the next suggestion
557 * Indexes the data from the given {@link Dictionary}.
558 * @param dict the dictionary to index
559 * @param mergeFactor mergeFactor to use when indexing
560 * @param ramMB the max amount or memory in MB to use
561 * @throws IOException
563 public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
564 indexDictionary(dict, mergeFactor, ramMB, true);
568 * Indexes the data from the given {@link Dictionary}.
569 * @param dict the dictionary to index
570 * @throws IOException
572 public final void indexDictionary(Dictionary dict) throws IOException {
573 indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
576 private static int getMin(int l) {
586 private static int getMax(int l) {
596 private static Document createDocument(String text, int ng1, int ng2) {
597 Document doc = new Document();
598 // the word field is never queried on... its indexed so it can be quickly
599 // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
600 Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
601 f.setIndexOptions(IndexOptions.DOCS_ONLY);
602 f.setOmitNorms(true);
603 doc.add(f); // orig term
604 addGram(text, doc, ng1, ng2);
608 private static void addGram(String text, Document doc, int ng1, int ng2) {
609 int len = text.length();
610 for (int ng = ng1; ng <= ng2; ng++) {
611 String key = "gram" + ng;
613 for (int i = 0; i < len - ng + 1; i++) {
614 String gram = text.substring(i, i + ng);
615 Field ngramField = new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
616 // spellchecker does not use positional queries, but we want freqs
617 // for scoring these multivalued n-gram fields.
618 ngramField.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
621 // only one term possible in the startXXField, TF/pos and norms aren't needed.
622 Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
623 startField.setIndexOptions(IndexOptions.DOCS_ONLY);
624 startField.setOmitNorms(true);
629 if (end != null) { // may not be present if len==ng1
630 // only one term possible in the endXXField, TF/pos and norms aren't needed.
631 Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
632 endField.setIndexOptions(IndexOptions.DOCS_ONLY);
633 endField.setOmitNorms(true);
639 private IndexSearcher obtainSearcher() {
640 synchronized (searcherLock) {
642 searcher.getIndexReader().incRef();
647 private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{
648 // don't check if open - always decRef
649 // don't decrement the private searcher - could have been swapped
650 aSearcher.getIndexReader().decRef();
653 private void ensureOpen() {
655 throw new AlreadyClosedException("Spellchecker has been closed");
660 * Close the IndexSearcher used by this SpellChecker
661 * @throws IOException if the close operation causes an {@link IOException}
662 * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
664 public void close() throws IOException {
665 synchronized (searcherLock) {
668 if (searcher != null) {
675 private void swapSearcher(final Directory dir) throws IOException {
677 * opening a searcher is possibly very expensive.
678 * We rather close it again if the Spellchecker was closed during
679 * this operation than block access to the current searcher while opening.
681 final IndexSearcher indexSearcher = createSearcher(dir);
682 synchronized (searcherLock) {
684 indexSearcher.close();
685 throw new AlreadyClosedException("Spellchecker has been closed");
687 if (searcher != null) {
690 // set the spellindex in the sync block - ensure consistency.
691 searcher = indexSearcher;
692 this.spellIndex = dir;
697 * Creates a new read-only IndexSearcher
698 * @param dir the directory used to open the searcher
699 * @return a new read-only IndexSearcher
700 * @throws IOException f there is a low-level IO error
702 // for testing purposes
703 IndexSearcher createSearcher(final Directory dir) throws IOException{
704 return new IndexSearcher(dir, true);
708 * Returns <code>true</code> if and only if the {@link SpellChecker} is
709 * closed, otherwise <code>false</code>.
711 * @return <code>true</code> if and only if the {@link SpellChecker} is
712 * closed, otherwise <code>false</code>.