pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / spellchecker / src / java / org / apache / lucene / search / spell / HighFrequencyDictionary.java
diff --git a/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java b/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java

new file mode 100644 (file)

index 0000000..7db6ba3
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.spell;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.util.StringHelper;
+
+/**
+ * HighFrequencyDictionary: terms taken from the given field
+ * of a Lucene index, which appear in a number of documents
+ * above a given threshold.
+ *
+ * When using IndexReader.terms(Term) the code must not call next() on TermEnum
+ * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
+ *
+ * Threshold is a value in [0..1] representing the minimum fraction
+ * of the total number of documents in which a term must appear.
+ * 
+ * Based on LuceneDictionary.
+ */
+public class HighFrequencyDictionary implements Dictionary {
+  private IndexReader reader;
+  private String field;
+  private float thresh;
+
+  public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
+    this.reader = reader;
+    this.field = StringHelper.intern(field);
+    this.thresh = thresh;
+  }
+
+  public final Iterator<String> getWordsIterator() {
+    return new HighFrequencyIterator();
+  }
+
+  final class HighFrequencyIterator implements TermFreqIterator {
+    private TermEnum termEnum;
+    private Term actualTerm;
+    private int actualFreq;
+    private boolean hasNextCalled;
+    private int minNumDocs;
+
+    HighFrequencyIterator() {
+      try {
+        termEnum = reader.terms(new Term(field, ""));
+        minNumDocs = (int)(thresh * (float)reader.numDocs());
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    private boolean isFrequent(Term term) {
+      try {
+        return reader.docFreq(term) >= minNumDocs;
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    public String next() {
+      if (!hasNextCalled) {
+        hasNext();
+      }
+      hasNextCalled = false;
+
+      try {
+        termEnum.next();
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+
+      return (actualTerm != null) ? actualTerm.text() : null;
+    }
+
+    public float freq() {
+      return actualFreq;
+    }
+
+
+    public boolean hasNext() {
+      if (hasNextCalled) {
+        return actualTerm != null;
+      }
+      hasNextCalled = true;
+
+      do {
+        actualTerm = termEnum.term();
+        actualFreq = termEnum.docFreq();
+
+        // if there are no words return false
+        if (actualTerm == null) {
+          return false;
+        }
+
+        String currentField = actualTerm.field();
+
+        // if the next word doesn't have the same field return false
+        if (currentField != field) {   // intern'd comparison
+          actualTerm = null;
+          return false;
+        }
+
+        // got a valid term, does it pass the threshold?
+        if (isFrequent(actualTerm)) {
+          return true;
+        }
+
+        // term not up to threshold
+        try {
+          termEnum.next();
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+
+      } while (true);
+    }
+
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+}