lucene-java-3.4.0/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java

   1 package org.apache.lucene.misc;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.index.IndexReader;
  21 import org.apache.lucene.index.Term;
  22 import org.apache.lucene.index.TermDocs;
  23 import org.apache.lucene.index.TermEnum;
  24 import org.apache.lucene.store.FSDirectory;
  25 import org.apache.lucene.util.PriorityQueue;
  26 import java.util.Arrays;
  27 import java.util.Comparator;
  28
  29 import java.io.File;
  30
  31 /**
  32  * <code>HighFreqTerms</code> class extracts the top n most frequent terms
  33  * (by document frequency ) from an existing Lucene index and reports their
  34  * document frequency.  If used with the -t flag it also reports their
  35  * total tf (total number of occurences) in order of highest total tf
  36  */
  37 public class HighFreqTerms {
  38
  39    // The top numTerms will be displayed
  40   public static final int DEFAULTnumTerms = 100;
  41   public static int numTerms = DEFAULTnumTerms;
  42
  43   public static void main(String[] args) throws Exception {
  44     IndexReader reader = null;
  45     FSDirectory dir = null;
  46     String field = null;
  47     boolean IncludeTermFreqs = false;
  48
  49     if (args.length == 0 || args.length > 4) {
  50       usage();
  51       System.exit(1);
  52     }
  53
  54     if (args.length > 0) {
  55       dir = FSDirectory.open(new File(args[0]));
  56     }
  57
  58     for (int i = 1; i < args.length; i++) {
  59       if (args[i].equals("-t")) {
  60         IncludeTermFreqs = true;
  61       }
  62       else{
  63         try {
  64           numTerms = Integer.parseInt(args[i]);
  65         } catch (NumberFormatException e) {
  66           field=args[i];
  67         }
  68       }
  69     }
  70
  71
  72     reader = IndexReader.open(dir, true);
  73     TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
  74     /*
  75      * Insert logic so it will only lookup totaltf if right arg
  76      * also change names as in flex
  77      */
  78     if (!IncludeTermFreqs) {
  79       //default HighFreqTerms behavior
  80       for (int i = 0; i < terms.length; i++) {
  81         System.out.printf("%s %,d \n",
  82             terms[i].term, terms[i].docFreq);
  83       }
  84     } else {
  85
  86       TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
  87       for (int i = 0; i < termsWithTF.length; i++) {
  88         System.out.printf("%s \t total_tf = %,d \t doc freq = %,d \n",
  89                           termsWithTF[i].term, termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
  90       }
  91     }
  92
  93     reader.close();
  94  }
  95
  96   private static void usage() {
  97     System.out
  98         .println("\n\n"
  99             + "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: include totalTermFreq\n\n");
 100   }
 101
 102   /**
 103    *
 104    * @param reader
 105    * @param numTerms
 106    * @param field
 107    * @return TermStats[] ordered by terms with highest docFreq first.
 108    * @throws Exception
 109    */
 110   public static TermStats[] getHighFreqTerms(IndexReader reader,
 111       int numTerms, String field) throws Exception {
 112
 113     TermInfoWiTFQueue tiq = new TermInfoWiTFQueue(numTerms);
 114     if (field != null) {
 115       TermEnum terms = reader.terms(new Term(field));
 116       if (terms != null && terms.term() != null) {
 117         do {
 118           if (!terms.term().field().equals(field)) {
 119             break;
 120           }
 121           tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
 122         } while (terms.next());
 123       } else {
 124         System.out.println("No terms for field \"" + field + "\"");
 125       }
 126     } else {
 127       TermEnum terms = reader.terms();
 128       while (terms.next()) {
 129         tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
 130       }
 131     }
 132
 133     TermStats[] result = new TermStats[tiq.size()];
 134
 135     // we want highest first so we read the queue and populate the array
 136     // starting at the end and work backwards
 137     int count = tiq.size() - 1;
 138     while (tiq.size() != 0) {
 139       result[count] = tiq.pop();
 140       count--;
 141     }
 142     return result;
 143   }
 144
 145   /**
 146    * Takes array of TermStats. For each term looks up the tf for each doc
 147    * containing the term and stores the total in the output array of TermStats.
 148    * Output array is sorted by highest total tf.
 149    *
 150    * @param reader
 151    * @param terms
 152    *          TermStats[]
 153    * @return TermStats[]
 154    * @throws Exception
 155    */
 156
 157   public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
 158     TermStats[] ts = new TermStats[terms.length]; // array for sorting
 159     long totalTF;
 160     for (int i = 0; i < terms.length; i++) {
 161       totalTF = getTotalTermFreq(reader, terms[i].term);
 162       ts[i] = new TermStats( terms[i].term, terms[i].docFreq, totalTF);
 163     }
 164
 165     Comparator<TermStats> c = new TotalTermFreqComparatorSortDescending();
 166     Arrays.sort(ts, c);
 167
 168     return ts;
 169   }
 170
 171   public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
 172     long totalTF = 0;
 173     TermDocs td = reader.termDocs(term);
 174     while (td.next()) {
 175       totalTF += td.freq();
 176     }
 177     return totalTF;
 178   }
 179 }
 180
 181
 182 final class TermStats {
 183   public Term term;
 184   public int docFreq;
 185   public long totalTermFreq;
 186
 187   public TermStats(Term t, int df) {
 188     this.term = t;
 189     this.docFreq = df;
 190   }
 191
 192   public TermStats(Term t, int df, long tf) {
 193     this.term = t;
 194     this.docFreq = df;
 195     this.totalTermFreq = tf;
 196   }
 197 }
 198
 199
 200 /**
 201  * Priority queue for TermStats objects ordered by TermStats.docFreq
 202  **/
 203 final class TermInfoWiTFQueue extends PriorityQueue<TermStats> {
 204   TermInfoWiTFQueue(int size) {
 205     initialize(size);
 206   }
 207
 208   @Override
 209   protected boolean lessThan(TermStats termInfoA,
 210       TermStats termInfoB) {
 211     return termInfoA.docFreq < termInfoB.docFreq;
 212   }
 213 }
 214
 215 /**
 216  * Comparator
 217  *
 218  * Reverse of normal Comparator. i.e. returns 1 if a.totalTermFreq is less than
 219  * b.totalTermFreq So we can sort in descending order of totalTermFreq
 220  */
 221 final class TotalTermFreqComparatorSortDescending implements Comparator<TermStats> {
 222
 223   public int compare(TermStats a, TermStats b) {
 224     if (a.totalTermFreq < b.totalTermFreq) {
 225       return 1;
 226     } else if (a.totalTermFreq > b.totalTermFreq) {
 227       return -1;
 228     } else {
 229       return 0;
 230     }
 231   }
 232 }
 233