1 package org.apache.lucene.misc;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.index.IndexReader;
21 import org.apache.lucene.index.IndexWriter;
22 import org.apache.lucene.index.Term;
23 import org.apache.lucene.util.LuceneTestCase;
24 import org.apache.lucene.store.Directory;
25 import org.apache.lucene.analysis.MockAnalyzer;
26 import org.apache.lucene.analysis.MockTokenizer;
27 import org.apache.lucene.document.Document;
28 import org.apache.lucene.document.Field;
29 import org.junit.AfterClass;
30 import org.junit.BeforeClass;
32 public class TestHighFreqTerms extends LuceneTestCase {
34 private static IndexWriter writer =null;
35 private static Directory dir = null;
36 private static IndexReader reader =null;
39 public static void setUpClass() throws Exception {
41 writer = new IndexWriter(dir, newIndexWriterConfig(random,
42 TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false))
43 .setMaxBufferedDocs(2));
45 reader = IndexReader.open(dir, true);
49 public static void tearDownClass() throws Exception{
56 /******************** Tests for getHighFreqTerms **********************************/
58 // test without specifying field (i.e. if we pass in field=null it should examine all fields)
59 // the term "diff" in the field "different_field" occurs 20 times and is the highest df term
60 public void testFirstTermHighestDocFreqAllFields () throws Exception{
63 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
64 assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq );
67 public void testFirstTermHighestDocFreq () throws Exception{
69 String field="FIELD_1";
70 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
71 assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq );
73 public void testOrderedByDocFreqDescending () throws Exception{
75 String field="FIELD_1";
76 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
77 for (int i = 0; i < terms.length; i++) {
79 assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
84 public void testNumTerms () throws Exception{
87 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
88 assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
91 public void testGetHighFreqTerms () throws Exception{
93 String field="FIELD_1";
94 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
96 for (int i = 0; i < terms.length; i++) {
97 String termtext = terms[i].term.text();
98 // hardcoded highTF or highTFmedDF
99 if (termtext.contains("highTF")) {
100 if (termtext.contains("medDF")) {
101 assertEquals("doc freq is not as expected", 5, terms[i].docFreq);
103 assertEquals("doc freq is not as expected", 1, terms[i].docFreq);
106 int n = Integer.parseInt(termtext);
107 assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
113 /********************Test sortByTotalTermFreq**********************************/
115 public void testFirstTermHighestTotalTermFreq () throws Exception{
118 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
119 TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
120 assertEquals("Term with highest totalTermFreq is first",200, termsWithTotalTermFreq[0].totalTermFreq);
122 public void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{
124 String field = "different_field";
125 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
126 TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
127 assertEquals("Term with highest totalTermFreq is first"+ termsWithTotalTermFreq[0].term.text(),150, termsWithTotalTermFreq[0].totalTermFreq);
130 public void testOrderedByTermFreqDescending () throws Exception{
132 String field = "FIELD_1";
133 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
134 TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
136 for (int i = 0; i < termsWithTF.length; i++) {
137 // check that they are sorted by descending termfreq order
139 assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
144 public void testGetTermFreqOrdered () throws Exception{
146 String field = "FIELD_1";
147 TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
148 TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
150 for (int i = 0; i < termsWithTF.length; i++) {
151 String text = termsWithTF[i].term.text();
152 if (text.contains("highTF")) {
153 if (text.contains("medDF")) {
154 assertEquals("total term freq is expected", 125,
155 termsWithTF[i].totalTermFreq);
157 assertEquals("total term freq is expected", 200,
158 termsWithTF[i].totalTermFreq);
162 int n = Integer.parseInt(text);
163 assertEquals("doc freq is expected", getExpecteddocFreq(n),
164 termsWithTF[i].docFreq);
165 assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
166 termsWithTF[i].totalTermFreq);
171 /********************Tests for getTotalTermFreq**********************************/
173 public void testGetTotalTermFreq() throws Exception{
174 String termtext ="highTF";
175 String field = "FIELD_1";
176 Term term = new Term(field,termtext);
177 long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, term);
178 assertEquals("highTf tf should be 200",200,totalTermFreq);
181 public void testGetTotalTermFreqBadTerm() throws Exception{
182 String termtext ="foobar";
183 String field = "FIELD_1";
184 Term term = new Term(field,termtext);
185 long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, term);
186 assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
188 /********************Testing Utils**********************************/
190 private static void indexDocs(IndexWriter writer) throws Exception {
193 * Generate 10 documents where term n has a docFreq of n and a totalTermFreq of n*n (n squared).
195 for (int i = 1; i <= 10; i++) {
196 Document doc = new Document();
197 String content = getContent(i);
199 doc.add(newField(random, "FIELD_1", content, Field.Store.YES,Field.Index.ANALYZED, Field.TermVector.NO));
200 //add a different field
201 doc.add(newField(random, "different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
202 writer.addDocument(doc);
205 //add 10 more docs with the term "diff"; this gives it the highest docFreq when we ask for the
206 //highest freq terms across all fields rather than for a specific field.
207 for (int i = 1; i <= 10; i++) {
208 Document doc = new Document();
209 doc.add(newField(random, "different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
210 writer.addDocument(doc);
212 // add some docs with terms that have a high tf but a low df, so that ordering by totalTermFreq differs from ordering by docFreq and we can see if sorting works
215 Document doc = new Document();
217 for (int i = 0; i < highTF; i++) {
218 content += "highTF ";
220 doc.add(newField(random, "FIELD_1", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
221 writer.addDocument(doc);
222 // highTF medium df =5
224 for (int i = 0; i < medium_df; i++) {
226 Document newdoc = new Document();
227 String newcontent = "";
228 for (int j = 0; j < tf; j++) {
229 newcontent += "highTFmedDF ";
231 newdoc.add(newField(random, "FIELD_1", newcontent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
232 writer.addDocument(newdoc);
234 // add a doc with high tf in field different_field
236 doc = new Document();
238 for (int i = 0; i < targetTF; i++) {
241 doc.add(newField(random, "different_field", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
242 writer.addDocument(doc);
249 * return string containing the numbers 10 down to i, with each number n occurring n times.
250 * e.g. for input of 9 the string is "10" repeated ten times followed by "9" repeated nine times, space separated
253 private static String getContent(int i) {
255 for (int j = 10; j >= i; j--) {
256 for (int k = 0; k < j; k++) {
257 // if j is 3 we return "3 3 3"
258 s += String.valueOf(j) + " ";
264 private static int getExpectedtotalTermFreq(int i) {
265 return getExpecteddocFreq(i) * i;
268 private static int getExpecteddocFreq(int i) {