lucene-java-3.5.0/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java

   1 package org.apache.lucene.misc;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.index.IndexReader;
  21 import org.apache.lucene.index.IndexWriter;
  22 import org.apache.lucene.index.Term;
  23 import org.apache.lucene.util.LuceneTestCase;
  24 import org.apache.lucene.store.Directory;
  25 import org.apache.lucene.analysis.MockAnalyzer;
  26 import org.apache.lucene.analysis.MockTokenizer;
  27 import org.apache.lucene.document.Document;
  28 import org.apache.lucene.document.Field;
  29 import org.junit.AfterClass;
  30 import org.junit.BeforeClass;
  31
  32 public class TestHighFreqTerms extends LuceneTestCase {
  33
  34   private static IndexWriter writer =null;
  35   private static Directory dir = null;
  36   private static IndexReader reader =null;
  37
  38   @BeforeClass
  39   public static void setUpClass() throws Exception {
  40     dir = newDirectory();
  41     writer = new IndexWriter(dir, newIndexWriterConfig(random,
  42        TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false))
  43        .setMaxBufferedDocs(2));
  44    indexDocs(writer);
  45    reader = IndexReader.open(dir, true);
  46   }
  47
  48   @AfterClass
  49   public static void tearDownClass() throws Exception{
  50     reader.close();
  51     dir.close();
  52     dir = null;
  53     reader = null;
  54   }
  55
  56 /******************** Tests for getHighFreqTerms **********************************/
  57
  58   // test without specifying field (i.e. if we pass in field=null it should examine all fields)
  59   // the term "diff" in the field "different_field" occurs 20 times and is the highest df term
  60   public void testFirstTermHighestDocFreqAllFields () throws Exception{
  61     int numTerms = 12;
  62     String field =null;
  63     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
  64     assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq );
  65   }
  66
  67   public void testFirstTermHighestDocFreq () throws Exception{
  68     int numTerms = 12;
  69     String field="FIELD_1";
  70     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
  71     assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq );
  72   }
  73   public void testOrderedByDocFreqDescending () throws Exception{
  74     int numTerms = 12;
  75     String field="FIELD_1";
  76     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
  77     for (int i = 0; i < terms.length; i++) {
  78       if (i >0){
  79        assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
  80       }
  81     }
  82   }
  83
  84   public void testNumTerms () throws Exception{
  85     int numTerms = 12;
  86     String field = null;
  87     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
  88     assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
  89   }
  90
  91   public void testGetHighFreqTerms () throws Exception{
  92     int numTerms=12;
  93     String field="FIELD_1";
  94     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
  95
  96     for (int i = 0; i < terms.length; i++) {
  97       String termtext = terms[i].term.text();
  98       // hardcoded highTF or highTFmedDF
  99       if (termtext.contains("highTF")) {
 100         if (termtext.contains("medDF")) {
 101           assertEquals("doc freq is not as expected", 5, terms[i].docFreq);
 102         } else {
 103           assertEquals("doc freq is not as expected", 1, terms[i].docFreq);
 104         }
 105       } else {
 106         int n = Integer.parseInt(termtext);
 107         assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
 108             terms[i].docFreq);
 109       }
 110     }
 111   }
 112
 113   /********************Test sortByTotalTermFreq**********************************/
 114
 115   public void testFirstTermHighestTotalTermFreq () throws Exception{
 116     int numTerms = 20;
 117     String field = null;
 118     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
 119     TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
 120     assertEquals("Term with highest totalTermFreq is first",200, termsWithTotalTermFreq[0].totalTermFreq);
 121   }
 122   public void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{
 123     int numTerms = 20;
 124     String field = "different_field";
 125     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
 126     TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
 127     assertEquals("Term with highest totalTermFreq is first"+ termsWithTotalTermFreq[0].term.text(),150, termsWithTotalTermFreq[0].totalTermFreq);
 128   }
 129
 130   public void testOrderedByTermFreqDescending () throws Exception{
 131     int numTerms = 12;
 132     String field = "FIELD_1";
 133     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
 134     TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
 135
 136   for (int i = 0; i < termsWithTF.length; i++) {
 137     // check that they are sorted by descending termfreq order
 138     if (i >0){
 139       assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
 140      }
 141     }
 142   }
 143
 144   public void testGetTermFreqOrdered () throws Exception{
 145     int numTerms = 12;
 146     String field = "FIELD_1";
 147     TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
 148     TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
 149
 150     for (int i = 0; i < termsWithTF.length; i++) {
 151       String text = termsWithTF[i].term.text();
 152       if (text.contains("highTF")) {
 153         if (text.contains("medDF")) {
 154           assertEquals("total term freq is expected", 125,
 155               termsWithTF[i].totalTermFreq);
 156         } else {
 157           assertEquals("total term freq is expected", 200,
 158               termsWithTF[i].totalTermFreq);
 159         }
 160
 161       } else {
 162         int n = Integer.parseInt(text);
 163         assertEquals("doc freq is expected", getExpecteddocFreq(n),
 164             termsWithTF[i].docFreq);
 165         assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
 166             termsWithTF[i].totalTermFreq);
 167       }
 168     }
 169   }
 170
 171     /********************Tests for getTotalTermFreq**********************************/
 172
 173   public void testGetTotalTermFreq() throws Exception{
 174     String termtext ="highTF";
 175     String field = "FIELD_1";
 176     Term term = new Term(field,termtext);
 177     long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, term);
 178     assertEquals("highTf tf should be 200",200,totalTermFreq);
 179   }
 180
 181   public void testGetTotalTermFreqBadTerm() throws Exception{
 182     String termtext ="foobar";
 183     String field = "FIELD_1";
 184     Term term = new Term(field,termtext);
 185     long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, term);
 186     assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
 187   }
 188     /********************Testing Utils**********************************/
 189
 190   private static void indexDocs(IndexWriter writer) throws Exception {
 191
 192      /**
 193       * Generate 10 documents where term n  has a docFreq of n and a totalTermFreq of n*2 (squared).
 194       */
 195     for (int i = 1; i <= 10; i++) {
 196       Document doc = new Document();
 197       String content = getContent(i);
 198
 199       doc.add(newField(random, "FIELD_1", content, Field.Store.YES,Field.Index.ANALYZED, Field.TermVector.NO));
 200       //add a different field
 201       doc.add(newField(random, "different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
 202       writer.addDocument(doc);
 203     }
 204
 205     //add 10 more docs with the term "diff" this will make it have the highest docFreq if we don't ask for the
 206     //highest freq terms for a specific field.
 207     for (int i = 1; i <= 10; i++) {
 208       Document doc = new Document();
 209       doc.add(newField(random, "different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
 210       writer.addDocument(doc);
 211     }
 212     // add some docs where tf < df so we can see if sorting works
 213     // highTF low df
 214     int highTF = 200;
 215     Document doc = new Document();
 216     String content = "";
 217     for (int i = 0; i < highTF; i++) {
 218       content += "highTF ";
 219     }
 220     doc.add(newField(random, "FIELD_1", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
 221     writer.addDocument(doc);
 222     // highTF medium df =5
 223     int medium_df = 5;
 224     for (int i = 0; i < medium_df; i++) {
 225       int tf = 25;
 226       Document newdoc = new Document();
 227       String newcontent = "";
 228       for (int j = 0; j < tf; j++) {
 229         newcontent += "highTFmedDF ";
 230       }
 231       newdoc.add(newField(random, "FIELD_1", newcontent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
 232       writer.addDocument(newdoc);
 233     }
 234     // add a doc with high tf in field different_field
 235     int targetTF =150;
 236     doc = new Document();
 237     content = "";
 238     for (int i = 0; i < targetTF; i++) {
 239       content += "TF150 ";
 240     }
 241     doc.add(newField(random, "different_field", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
 242     writer.addDocument(doc);
 243     writer.close();
 244
 245   }
 246
 247 /**
 248  *  getContent
 249  *  return string containing numbers 1 to i with each number n occurring n times.
 250  *  i.e. for input of 3 return string "3 3 3 2 2 1"
 251  */
 252
 253   private static String getContent(int i) {
 254     String s = "";
 255     for (int j = 10; j >= i; j--) {
 256       for (int k = 0; k < j; k++) {
 257         // if j is 3 we return "3 3 3"
 258         s += String.valueOf(j) + " ";
 259       }
 260     }
 261     return s;
 262   }
 263
 264   private static int getExpectedtotalTermFreq(int i) {
 265     return getExpecteddocFreq(i) * i;
 266   }
 267
 268   private static int getExpecteddocFreq(int i) {
 269     return i;
 270   }
 271 }