lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java

   1 package org.apache.lucene.search.spell;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.ArrayList;
  22 import java.util.Comparator;
  23 import java.util.Iterator;
  24 import java.util.List;
  25
  26 import org.apache.lucene.analysis.WhitespaceAnalyzer;
  27 import org.apache.lucene.document.Document;
  28 import org.apache.lucene.document.Field;
  29 import org.apache.lucene.index.FieldInfo.IndexOptions;
  30 import org.apache.lucene.index.IndexReader;
  31 import org.apache.lucene.index.IndexWriter;
  32 import org.apache.lucene.index.IndexWriterConfig;
  33 import org.apache.lucene.index.Term;
  34 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  35 import org.apache.lucene.search.BooleanClause;
  36 import org.apache.lucene.search.BooleanQuery;
  37 import org.apache.lucene.search.IndexSearcher;
  38 import org.apache.lucene.search.Query;
  39 import org.apache.lucene.search.ScoreDoc;
  40 import org.apache.lucene.search.TermQuery;
  41 import org.apache.lucene.store.AlreadyClosedException;
  42 import org.apache.lucene.store.Directory;
  43 import org.apache.lucene.util.ReaderUtil;
  44 import org.apache.lucene.util.Version;
  45
  46 /**
  47  * <p>
  48  *   Spell Checker class  (Main class) <br/>
  49  *  (initially inspired by the David Spencer code).
  50  * </p>
  51  *
  52  * <p>Example Usage:
  53  *
  54  * <pre>
  55  *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
  56  *  // To index a field of a user index:
  57  *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
  58  *  // To index a file containing words:
  59  *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
  60  *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
  61  * </pre>
  62  *
  63  *
  64  * @version 1.0
  65  */
  66 public class SpellChecker implements java.io.Closeable {
  67
   68   /**
   69    * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} .
   70    */
   71   public static final float DEFAULT_ACCURACY = 0.5f;
   72
   73   /**
   74    * Field name for each word in the ngram index.
   75    */
   76   public static final String F_WORD = "word";
   77   // template term for the F_WORD field; per-word terms are derived from it via createTerm(word)
   78   private static final Term F_WORD_TERM = new Term(F_WORD);
   79
   80   /**
   81    * The directory holding the spell index; assigned in swapSearcher() together with the searcher.
   82    */
   83   // don't modify the directory directly - see #swapSearcher()
   84   // TODO: why is this package private?
   85   Directory spellIndex;
   86   /**
   87    * Boost for the "start" + n clause of the suggestion query; the clause is only added when this is &gt; 0.
   88    */
   89   private float bStart = 2.0f;
   90   // boost for the "end" + n clause of the suggestion query; the clause is only added when this is > 0
   91   private float bEnd = 1.0f;
   92   // don't use this searcher directly - see #swapSearcher() and #obtainSearcher()
   93
   94   private IndexSearcher searcher;
   95   /*
   96    * this locks all modifications to the current searcher.
   97    */
   98
   99   private final Object searcherLock = new Object();
  100   /*
  101    * this lock synchronizes all possible modifications to the
  102    * current index directory. It should not be possible to try modifying
  103    * the same index concurrently. Note: Do not acquire the searcher lock
  104    * before acquiring this lock!
  105    */
  106   private final Object modifyCurrentIndexLock = new Object();
  107   // set to true by close(); checked by ensureOpen() and swapSearcher()
  108   private volatile boolean closed = false;
  109   // minimum score for hits generated by the spell checker query
  110
  111   private float accuracy = DEFAULT_ACCURACY;
  112   // sd scores each candidate against the input word; comparator is handed to the SuggestWordQueue
  113   private StringDistance sd;
  114   private Comparator<SuggestWord> comparator;
 115
 116   /**
 117    * Use the given directory as a spell checker index. The directory
 118    * is created if it doesn't exist yet.
 119    * @param spellIndex the spell index directory
 120    * @param sd the {@link StringDistance} measurement to use
 121    * @throws IOException if Spellchecker can not open the directory
 122    */
 123   public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
 124     this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
 125   }
 126   /**
 127    * Use the given directory as a spell checker index with a
 128    * {@link LevensteinDistance} as the default {@link StringDistance}. The
 129    * directory is created if it doesn't exist yet.
 130    *
 131    * @param spellIndex
 132    *          the spell index directory
 133    * @throws IOException
 134    *           if spellchecker can not open the directory
 135    */
 136   public SpellChecker(Directory spellIndex) throws IOException {
 137     this(spellIndex, new LevensteinDistance());
 138   }
 139
 140   /**
 141    * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
 142    * and the given {@link java.util.Comparator} for sorting the results.
 143    * @param spellIndex The spelling index
 144    * @param sd The distance
 145    * @param comparator The comparator
 146    * @throws IOException if there is a problem opening the index
 147    */
 148   public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
 149     setSpellIndex(spellIndex);
 150     setStringDistance(sd);
 151     this.comparator = comparator;
 152   }
 153
 154   /**
 155    * Use a different index as the spell checker index or re-open
 156    * the existing index if <code>spellIndex</code> is the same value
 157    * as given in the constructor.
 158    * @param spellIndexDir the spell directory to use
 159    * @throws AlreadyClosedException if the Spellchecker is already closed
 160    * @throws  IOException if spellchecker can not open the directory
 161    */
 162   // TODO: we should make this final as it is called in the constructor
 163   public void setSpellIndex(Directory spellIndexDir) throws IOException {
 164     // this could be the same directory as the current spellIndex
 165     // modifications to the directory should be synchronized
 166     synchronized (modifyCurrentIndexLock) {
 167       ensureOpen();
 168       if (!IndexReader.indexExists(spellIndexDir)) {
 169           IndexWriter writer = new IndexWriter(spellIndexDir,
 170             new IndexWriterConfig(Version.LUCENE_CURRENT,
 171                 new WhitespaceAnalyzer(Version.LUCENE_CURRENT)));
 172           writer.close();
 173       }
 174       swapSearcher(spellIndexDir);
 175     }
 176   }
 177
 178   /**
 179    * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
 180    * @param comparator the comparator
 181    */
 182   public void setComparator(Comparator<SuggestWord> comparator) {
 183     this.comparator = comparator;
 184   }
 185
 186   public Comparator<SuggestWord> getComparator() {
 187     return comparator;
 188   }
 189
 190   /**
 191    * Sets the {@link StringDistance} implementation for this
 192    * {@link SpellChecker} instance.
 193    *
 194    * @param sd the {@link StringDistance} implementation for this
 195    * {@link SpellChecker} instance
 196    */
 197   public void setStringDistance(StringDistance sd) {
 198     this.sd = sd;
 199   }
 200   /**
 201    * Returns the {@link StringDistance} instance used by this
 202    * {@link SpellChecker} instance.
 203    *
 204    * @return the {@link StringDistance} instance used by this
 205    *         {@link SpellChecker} instance.
 206    */
 207   public StringDistance getStringDistance() {
 208     return sd;
 209   }
 210
 211   /**
 212    * Sets the accuracy 0 &lt; minScore &lt; 1; default {@link #DEFAULT_ACCURACY}
 213    * @param acc The new accuracy
 214    */
 215   public void setAccuracy(float acc) {
 216     this.accuracy = acc;
 217   }
 218
 219   /**
 220    * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to
 221    * decide whether a suggestion is included or not.
 222    * @return The current accuracy setting
 223    */
 224   public float getAccuracy() {
 225     return accuracy;
 226   }
 227
 228   /**
 229    * Suggest similar words.
 230    *
 231    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 232    * is not the same as the edit distance strategy used to calculate the best
 233    * matching spell-checked word from the hits that Lucene found, one usually has
 234    * to retrieve a couple of numSug's in order to get the true best match.
 235    *
 236    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 237    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 238    *
 239    * @param word the word you want a spell check done on
 240    * @param numSug the number of suggested words
 241    * @throws IOException if the underlying index throws an {@link IOException}
 242    * @throws AlreadyClosedException if the Spellchecker is already closed
 243    * @return String[]
 244    *
 245    * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
 246    */
 247   public String[] suggestSimilar(String word, int numSug) throws IOException {
 248     return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
 249   }
 250
 251   /**
 252    * Suggest similar words.
 253    *
 254    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 255    * is not the same as the edit distance strategy used to calculate the best
 256    * matching spell-checked word from the hits that Lucene found, one usually has
 257    * to retrieve a couple of numSug's in order to get the true best match.
 258    *
 259    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 260    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 261    *
 262    * @param word the word you want a spell check done on
 263    * @param numSug the number of suggested words
 264    * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
 265    * @throws IOException if the underlying index throws an {@link IOException}
 266    * @throws AlreadyClosedException if the Spellchecker is already closed
 267    * @return String[]
 268    *
 269    * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
 270    */
 271   public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
 272     return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
 273   }
 274
 275   /**
 276    * Suggest similar words (optionally restricted to a field of an index).
 277    *
 278    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 279    * is not the same as the edit distance strategy used to calculate the best
 280    * matching spell-checked word from the hits that Lucene found, one usually has
 281    * to retrieve a couple of numSug's in order to get the true best match.
 282    *
 283    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 284    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 285    *
 286    * <p>Uses the {@link #getAccuracy()} value passed into the constructor as the accuracy.
 287    *
 288    * @param word the word you want a spell check done on
 289    * @param numSug the number of suggested words
 290    * @param ir the indexReader of the user index (can be null see field param)
 291    * @param field the field of the user index: if field is not null, the suggested
 292    * words are restricted to the words present in this field.
 293    * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
 294    * (only if restricted mode = (indexReader!=null and field!=null)
 295    * @throws IOException if the underlying index throws an {@link IOException}
 296    * @throws AlreadyClosedException if the Spellchecker is already closed
 297    * @return String[] the sorted list of the suggest words with these 2 criteria:
 298    * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
 299    * of the suggest words in the field of the user index
 300    *
 301    * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 302    *
 303    * @deprecated
 304    *  use suggestSimilar(String, int, IndexReader, String, SuggestMode)
 305    *  <ul>
 306          *      <li>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX instead of morePopular=false</li>
 307          *      <li>SuggestMode.SuGGEST_MORE_POPULAR instead of morePopular=true</li>
 308    *  </ul>
 309    */
 310   @Deprecated
 311   public String[] suggestSimilar(String word, int numSug, IndexReader ir,
 312       String field, boolean morePopular) throws IOException {
 313     return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
 314   }
 315
 316
 317   /**
 318    * Suggest similar words (optionally restricted to a field of an index).
 319    *
 320    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 321    * is not the same as the edit distance strategy used to calculate the best
 322    * matching spell-checked word from the hits that Lucene found, one usually has
 323    * to retrieve a couple of numSug's in order to get the true best match.
 324    *
 325    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 326    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 327    *
 328    * @param word the word you want a spell check done on
 329    * @param numSug the number of suggested words
 330    * @param ir the indexReader of the user index (can be null see field param)
 331    * @param field the field of the user index: if field is not null, the suggested
 332    * words are restricted to the words present in this field.
 333    * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
 334    * (only if restricted mode = (indexReader!=null and field!=null)
 335    * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
 336    * @throws IOException if the underlying index throws an {@link IOException}
 337    * @throws AlreadyClosedException if the Spellchecker is already closed
 338    * @return String[] the sorted list of the suggest words with these 2 criteria:
 339    * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
 340    * of the suggest words in the field of the user index
 341    *
 342    * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 343    *
 344    * @deprecated
 345    *  use suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 346    *  <ul>
 347          *      <li>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX instead of morePopular=false</li>
 348          *      <li>SuggestMode.SuGGEST_MORE_POPULAR instead of morePopular=true</li>
 349    *  </ul>
 350    */
 351   @Deprecated
 352   public String[] suggestSimilar(String word, int numSug, IndexReader ir,
 353       String field, boolean morePopular, float accuracy) throws IOException {
 354         return suggestSimilar(word, numSug, ir, field, morePopular ? SuggestMode.SUGGEST_MORE_POPULAR :
 355                 SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
 356   }
 357
 358   /**
 359    * Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 360    *       suggestSimilar(word, numSug, ir, suggestMode, field, this.accuracy)}
 361    *
 362    */
 363   public String[] suggestSimilar(String word, int numSug, IndexReader ir,
 364       String field, SuggestMode suggestMode) throws IOException {
 365         return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy);
 366   }
 367
 368   /**
 369    * Suggest similar words (optionally restricted to a field of an index).
 370    *
 371    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 372    * is not the same as the edit distance strategy used to calculate the best
 373    * matching spell-checked word from the hits that Lucene found, one usually has
 374    * to retrieve a couple of numSug's in order to get the true best match.
 375    *
 376    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 377    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 378    *
 379    * @param word the word you want a spell check done on
 380    * @param numSug the number of suggested words
 381    * @param ir the indexReader of the user index (can be null see field param)
 382    * @param field the field of the user index: if field is not null, the suggested
 383    * words are restricted to the words present in this field.
 384    * @param suggestMode
 385    * (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
 386    * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
 387    * @throws IOException if the underlying index throws an {@link IOException}
 388    * @throws AlreadyClosedException if the Spellchecker is already closed
 389    * @return String[] the sorted list of the suggest words with these 2 criteria:
 390    * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
 391    * of the suggest words in the field of the user index
 392    *
 393    */
 394   public String[] suggestSimilar(String word, int numSug, IndexReader ir,
 395       String field, SuggestMode suggestMode, float accuracy) throws IOException {
 396     // obtainSearcher calls ensureOpen
 397     final IndexSearcher indexSearcher = obtainSearcher();
 398     try {
 399       if (ir == null || field == null) {
 400         suggestMode = SuggestMode.SUGGEST_ALWAYS;
 401       }
 402       if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
 403         ir = null;
 404         field = null;
 405       }
 406
 407       final int lengthWord = word.length();
 408
 409       final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
 410       final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
 411       // if the word exists in the real index and we don't care for word frequency, return the word itself
 412       if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
 413         return new String[] { word };
 414       }
 415
 416       BooleanQuery query = new BooleanQuery();
 417       String[] grams;
 418       String key;
 419
 420       for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
 421
 422         key = "gram" + ng; // form key
 423
 424         grams = formGrams(word, ng); // form word into ngrams (allow dups too)
 425
 426         if (grams.length == 0) {
 427           continue; // hmm
 428         }
 429
 430         if (bStart > 0) { // should we boost prefixes?
 431           add(query, "start" + ng, grams[0], bStart); // matches start of word
 432
 433         }
 434         if (bEnd > 0) { // should we boost suffixes
 435           add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
 436
 437         }
 438         for (int i = 0; i < grams.length; i++) {
 439           add(query, key, grams[i]);
 440         }
 441       }
 442
 443       int maxHits = 10 * numSug;
 444
 445   //    System.out.println("Q: " + query);
 446       ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
 447   //    System.out.println("HITS: " + hits.length());
 448       SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
 449
 450       // go thru more than 'maxr' matches in case the distance filter triggers
 451       int stop = Math.min(hits.length, maxHits);
 452       SuggestWord sugWord = new SuggestWord();
 453       for (int i = 0; i < stop; i++) {
 454
 455         sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word
 456
 457         // don't suggest a word for itself, that would be silly
 458         if (sugWord.string.equals(word)) {
 459           continue;
 460         }
 461
 462         // edit distance
 463         sugWord.score = sd.getDistance(word,sugWord.string);
 464         if (sugWord.score < accuracy) {
 465           continue;
 466         }
 467
 468         if (ir != null && field != null) { // use the user index
 469           sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
 470           // don't suggest a word that is not present in the field
 471           if ((suggestMode==SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) {
 472             continue;
 473           }
 474         }
 475         sugQueue.insertWithOverflow(sugWord);
 476         if (sugQueue.size() == numSug) {
 477           // if queue full, maintain the minScore score
 478           accuracy = sugQueue.top().score;
 479         }
 480         sugWord = new SuggestWord();
 481       }
 482
 483       // convert to array string
 484       String[] list = new String[sugQueue.size()];
 485       for (int i = sugQueue.size() - 1; i >= 0; i--) {
 486         list[i] = sugQueue.pop().string;
 487       }
 488
 489       return list;
 490     } finally {
 491       releaseSearcher(indexSearcher);
 492     }
 493   }
 494   /**
 495    * Add a clause to a boolean query.
 496    */
 497   private static void add(BooleanQuery q, String name, String value, float boost) {
 498     Query tq = new TermQuery(new Term(name, value));
 499     tq.setBoost(boost);
 500     q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
 501   }
 502
 503   /**
 504    * Add a clause to a boolean query.
 505    */
 506   private static void add(BooleanQuery q, String name, String value) {
 507     q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
 508   }
 509
 510   /**
 511    * Form all ngrams for a given word.
 512    * @param text the word to parse
 513    * @param ng the ngram length e.g. 3
 514    * @return an array of all ngrams in the word and note that duplicates are not removed
 515    */
 516   private static String[] formGrams(String text, int ng) {
 517     int len = text.length();
 518     String[] res = new String[len - ng + 1];
 519     for (int i = 0; i < len - ng + 1; i++) {
 520       res[i] = text.substring(i, i + ng);
 521     }
 522     return res;
 523   }
 524
 525   /**
 526    * Removes all terms from the spell check index.
 527    * @throws IOException
 528    * @throws AlreadyClosedException if the Spellchecker is already closed
 529    */
 530   public void clearIndex() throws IOException {
 531     synchronized (modifyCurrentIndexLock) {
 532       ensureOpen();
 533       final Directory dir = this.spellIndex;
 534       final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
 535           Version.LUCENE_CURRENT,
 536           new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
 537           .setOpenMode(OpenMode.CREATE));
 538       writer.close();
 539       swapSearcher(dir);
 540     }
 541   }
 542
 543   /**
 544    * Check whether the word exists in the index.
 545    * @param word
 546    * @throws IOException
 547    * @throws AlreadyClosedException if the Spellchecker is already closed
 548    * @return true if the word exists in the index
 549    */
 550   public boolean exist(String word) throws IOException {
 551     // obtainSearcher calls ensureOpen
 552     final IndexSearcher indexSearcher = obtainSearcher();
 553     try{
 554       return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0;
 555     } finally {
 556       releaseSearcher(indexSearcher);
 557     }
 558   }
 559
 560   /**
 561    * Indexes the data from the given {@link Dictionary}.
 562    * @param dict Dictionary to index
 563    * @param config {@link IndexWriterConfig} to use
 564    * @param fullMerge whether or not the spellcheck index should be fully merged
 565    * @throws AlreadyClosedException if the Spellchecker is already closed
 566    * @throws IOException
 567    */
 568   public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge) throws IOException {
 569     synchronized (modifyCurrentIndexLock) {
 570       ensureOpen();
 571       final Directory dir = this.spellIndex;
 572       final IndexWriter writer = new IndexWriter(dir, config);
 573       IndexSearcher indexSearcher = obtainSearcher();
 574       final List<IndexReader> readers = new ArrayList<IndexReader>();
 575
 576       if (searcher.maxDoc() > 0) {
 577         ReaderUtil.gatherSubReaders(readers, searcher.getIndexReader());
 578       }
 579
 580       boolean isEmpty = readers.isEmpty();
 581
 582       try {
 583         Iterator<String> iter = dict.getWordsIterator();
 584
 585         terms: while (iter.hasNext()) {
 586           String word = iter.next();
 587
 588           int len = word.length();
 589           if (len < 3) {
 590             continue; // too short we bail but "too long" is fine...
 591           }
 592
 593           if (!isEmpty) {
 594             // we have a non-empty index, check if the term exists
 595             Term term = F_WORD_TERM.createTerm(word);
 596             for (IndexReader ir : readers) {
 597               if (ir.docFreq(term) > 0) {
 598                 continue terms;
 599               }
 600             }
 601           }
 602
 603           // ok index the word
 604           Document doc = createDocument(word, getMin(len), getMax(len));
 605           writer.addDocument(doc);
 606         }
 607       } finally {
 608         releaseSearcher(indexSearcher);
 609       }
 610       if (fullMerge) {
 611         writer.forceMerge(1);
 612       }
 613       // close writer
 614       writer.close();
 615       // TODO: this isn't that great, maybe in the future SpellChecker should take
 616       // IWC in its ctor / keep its writer open?
 617
 618       // also re-open the spell index to see our own changes when the next suggestion
 619       // is fetched:
 620       swapSearcher(dir);
 621     }
 622   }
 623
 624   private static int getMin(int l) {
 625     if (l > 5) {
 626       return 3;
 627     }
 628     if (l == 5) {
 629       return 2;
 630     }
 631     return 1;
 632   }
 633
 634   private static int getMax(int l) {
 635     if (l > 5) {
 636       return 4;
 637     }
 638     if (l == 5) {
 639       return 3;
 640     }
 641     return 2;
 642   }
 643
 644   private static Document createDocument(String text, int ng1, int ng2) {
 645     Document doc = new Document();
 646     // the word field is never queried on... its indexed so it can be quickly
 647     // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
 648     Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
 649     f.setIndexOptions(IndexOptions.DOCS_ONLY);
 650     f.setOmitNorms(true);
 651     doc.add(f); // orig term
 652     addGram(text, doc, ng1, ng2);
 653     return doc;
 654   }
 655
 656   private static void addGram(String text, Document doc, int ng1, int ng2) {
 657     int len = text.length();
 658     for (int ng = ng1; ng <= ng2; ng++) {
 659       String key = "gram" + ng;
 660       String end = null;
 661       for (int i = 0; i < len - ng + 1; i++) {
 662         String gram = text.substring(i, i + ng);
 663         Field ngramField = new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
 664         // spellchecker does not use positional queries, but we want freqs
 665         // for scoring these multivalued n-gram fields.
 666         ngramField.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
 667         doc.add(ngramField);
 668         if (i == 0) {
 669           // only one term possible in the startXXField, TF/pos and norms aren't needed.
 670           Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
 671           startField.setIndexOptions(IndexOptions.DOCS_ONLY);
 672           startField.setOmitNorms(true);
 673           doc.add(startField);
 674         }
 675         end = gram;
 676       }
 677       if (end != null) { // may not be present if len==ng1
 678         // only one term possible in the endXXField, TF/pos and norms aren't needed.
 679         Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
 680         endField.setIndexOptions(IndexOptions.DOCS_ONLY);
 681         endField.setOmitNorms(true);
 682         doc.add(endField);
 683       }
 684     }
 685   }
 686
 687   private IndexSearcher obtainSearcher() {
 688     synchronized (searcherLock) {
 689       ensureOpen();
 690       searcher.getIndexReader().incRef();
 691       return searcher;
 692     }
 693   }
 694
 695   private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{
 696       // don't check if open - always decRef
 697       // don't decrement the private searcher - could have been swapped
 698       aSearcher.getIndexReader().decRef();
 699   }
 700
 701   private void ensureOpen() {
 702     if (closed) {
 703       throw new AlreadyClosedException("Spellchecker has been closed");
 704     }
 705   }
 706
 707   /**
 708    * Close the IndexSearcher used by this SpellChecker
 709    * @throws IOException if the close operation causes an {@link IOException}
 710    * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
 711    */
 712   public void close() throws IOException {
 713     synchronized (searcherLock) {
 714       ensureOpen();
 715       closed = true;
 716       if (searcher != null) {
 717         searcher.close();
 718       }
 719       searcher = null;
 720     }
 721   }
 722
 723   private void swapSearcher(final Directory dir) throws IOException {
 724     /*
 725      * opening a searcher is possibly very expensive.
 726      * We rather close it again if the Spellchecker was closed during
 727      * this operation than block access to the current searcher while opening.
 728      */
 729     final IndexSearcher indexSearcher = createSearcher(dir);
 730     synchronized (searcherLock) {
 731       if(closed){
 732         indexSearcher.close();
 733         throw new AlreadyClosedException("Spellchecker has been closed");
 734       }
 735       if (searcher != null) {
 736         searcher.close();
 737       }
 738       // set the spellindex in the sync block - ensure consistency.
 739       searcher = indexSearcher;
 740       this.spellIndex = dir;
 741     }
 742   }
 743
 744   /**
 745    * Creates a new read-only IndexSearcher
 746    * @param dir the directory used to open the searcher
 747    * @return a new read-only IndexSearcher
 748    * @throws IOException f there is a low-level IO error
 749    */
 750   // for testing purposes
 751   IndexSearcher createSearcher(final Directory dir) throws IOException{
 752     return new IndexSearcher(dir, true);
 753   }
 754
 755   /**
 756    * Returns <code>true</code> if and only if the {@link SpellChecker} is
 757    * closed, otherwise <code>false</code>.
 758    *
 759    * @return <code>true</code> if and only if the {@link SpellChecker} is
 760    *         closed, otherwise <code>false</code>.
 761    */
 762   boolean isClosed(){
 763     return closed;
 764   }
 765
 766 }