--- /dev/null
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.spell;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.util.StringHelper;
+
+/**
+ * HighFrequencyDictionary: terms taken from the given field
+ * of a Lucene index, which appear in a number of documents
+ * above a given threshold.
+ *
+ * When using IndexReader.terms(Term) the code must not call next() on TermEnum
+ * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
+ *
+ * Threshold is a value in [0..1] representing the minimum fraction
+ * of the index's documents in which a term must appear to be returned.
+ *
+ * Based on LuceneDictionary.
+ */
+public class HighFrequencyDictionary implements Dictionary {
+ private IndexReader reader;
+ private String field;
+ private float thresh;
+
+ public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
+ this.reader = reader;
+ this.field = StringHelper.intern(field);
+ this.thresh = thresh;
+ }
+
+ public final Iterator<String> getWordsIterator() {
+ return new HighFrequencyIterator();
+ }
+
+ final class HighFrequencyIterator implements TermFreqIterator {
+ private TermEnum termEnum;
+ private Term actualTerm;
+ private int actualFreq;
+ private boolean hasNextCalled;
+ private int minNumDocs;
+
+ HighFrequencyIterator() {
+ try {
+ termEnum = reader.terms(new Term(field, ""));
+ minNumDocs = (int)(thresh * (float)reader.numDocs());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private boolean isFrequent(Term term) {
+ try {
+ return reader.docFreq(term) >= minNumDocs;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public String next() {
+ if (!hasNextCalled) {
+ hasNext();
+ }
+ hasNextCalled = false;
+
+ try {
+ termEnum.next();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return (actualTerm != null) ? actualTerm.text() : null;
+ }
+
+ public float freq() {
+ return actualFreq;
+ }
+
+
+ public boolean hasNext() {
+ if (hasNextCalled) {
+ return actualTerm != null;
+ }
+ hasNextCalled = true;
+
+ do {
+ actualTerm = termEnum.term();
+ actualFreq = termEnum.docFreq();
+
+ // if there are no words return false
+ if (actualTerm == null) {
+ return false;
+ }
+
+ String currentField = actualTerm.field();
+
+ // if the next word doesn't have the same field return false
+ if (currentField != field) { // intern'd comparison
+ actualTerm = null;
+ return false;
+ }
+
+ // got a valid term, does it pass the threshold?
+ if (isFrequent(actualTerm)) {
+ return true;
+ }
+
+ // term not up to threshold
+ try {
+ termEnum.next();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ } while (true);
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+}