2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.search.spell;
20 import java.io.IOException;
21 import java.util.Iterator;
23 import org.apache.lucene.index.IndexReader;
24 import org.apache.lucene.index.Term;
25 import org.apache.lucene.index.TermEnum;
26 import org.apache.lucene.search.spell.Dictionary;
27 import org.apache.lucene.util.StringHelper;
30 * HighFrequencyDictionary: terms taken from the given field
31 * of a Lucene index, which appear in a number of documents
32 * above a given threshold.
34 * When using IndexReader.terms(Term) the code must not call next() on TermEnum
35 * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
37 * Threshold is a value in [0..1] giving the minimum fraction
38 * of the total number of documents in which a term must appear.
40 * Based on LuceneDictionary.
// NOTE(review): the line numbers embedded in this listing have gaps (e.g. 43 -> 47, 93 -> 101),
// so some source lines are not shown; the `field` and `thresh` members used below have no
// visible declaration here.
42 public class HighFrequencyDictionary implements Dictionary {
// Index whose terms are enumerated and whose document counts are queried.
43 private IndexReader reader;
// Builds a dictionary over `field` of `reader`; `thresh` is later multiplied by
// reader.numDocs() to obtain the document-count cutoff used by the iterator.
47 public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
// The field name is interned so that hasNext() can compare field names by reference.
49 this.field = StringHelper.intern(field);
// Returns a new HighFrequencyIterator on every call.
53 public final Iterator<String> getWordsIterator() {
54 return new HighFrequencyIterator();
// Iterator over term texts; TermFreqIterator is declared outside this listing.
57 final class HighFrequencyIterator implements TermFreqIterator {
// Term enumeration obtained from reader.terms(...) in the constructor.
58 private TermEnum termEnum;
// Term most recently read from termEnum by hasNext(); next() returns its text.
59 private Term actualTerm;
// termEnum.docFreq() captured together with actualTerm.
60 private int actualFreq;
// Set to true by hasNext() and reset to false by next().
61 private boolean hasNextCalled;
// Document-frequency cutoff: isFrequent() accepts a term when docFreq >= minNumDocs.
62 private int minNumDocs;
// Opens the term enumeration at Term(field, "") and computes the cutoff;
// an IOException is rethrown wrapped in a RuntimeException.
64 HighFrequencyIterator() {
66 termEnum = reader.terms(new Term(field, ""));
// The (int) cast truncates, so a product in [0, 1) yields a cutoff of 0,
// which any non-negative docFreq satisfies.
67 minNumDocs = (int)(thresh * (float)reader.numDocs());
68 } catch (IOException e) {
69 throw new RuntimeException(e);
// True when reader.docFreq(term) reaches minNumDocs;
// an IOException is rethrown wrapped in a RuntimeException.
73 private boolean isFrequent(Term term) {
75 return reader.docFreq(term) >= minNumDocs;
76 } catch (IOException e) {
77 throw new RuntimeException(e);
// Returns the text of actualTerm, or null when actualTerm is null.
// Some statements of this method are not visible in this listing.
81 public String next() {
// Cleared here; hasNext() sets the flag back to true.
85 hasNextCalled = false;
89 } catch (IOException e) {
90 throw new RuntimeException(e);
93 return (actualTerm != null) ? actualTerm.text() : null;
// Reads the current term and its docFreq from termEnum and tests it in turn:
// non-null, belonging to `field`, and frequent enough. The statements executed
// in each branch are not visible in this listing.
101 public boolean hasNext() {
103 return actualTerm != null;
105 hasNextCalled = true;
108 actualTerm = termEnum.term();
109 actualFreq = termEnum.docFreq();
111 // if there are no words return false
112 if (actualTerm == null) {
116 String currentField = actualTerm.field();
118 // if the next word doesn't have the same field return false
// Reference comparison (!=) is deliberate: `field` was interned in the outer constructor.
// NOTE(review): presumably Term.field() returns an interned string as well -- confirm.
119 if (currentField != field) { // intern'd comparison
124 // got a valid term, does it pass the threshold?
// NOTE(review): actualFreq already holds termEnum.docFreq(); isFrequent() queries
// reader.docFreq() again -- confirm whether the second lookup is needed.
125 if (isFrequent(actualTerm)) {
129 // term not up to threshold
132 } catch (IOException e) {
133 throw new RuntimeException(e);
// Removal is not supported by this iterator.
139 public void remove() {
140 throw new UnsupportedOperationException();