lucene-java-3.4.0/lucene/src/test/org/apache/lucene/index/TestOmitTf.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Collection;
  22
  23 import org.apache.lucene.util.LuceneTestCase;
  24 import org.apache.lucene.util._TestUtil;
  25 import org.apache.lucene.analysis.Analyzer;
  26 import org.apache.lucene.analysis.MockAnalyzer;
  27 import org.apache.lucene.document.Document;
  28 import org.apache.lucene.document.Field;
  29 import org.apache.lucene.index.FieldInfo.IndexOptions;
  30 import org.apache.lucene.search.*;
  31 import org.apache.lucene.search.BooleanClause.Occur;
  32 import org.apache.lucene.store.Directory;
  33 import org.apache.lucene.search.Explanation.IDFExplanation;
  34
  35
  36 public class TestOmitTf extends LuceneTestCase {
  37
  38   public static class SimpleSimilarity extends Similarity {
  39     @Override public float computeNorm(String field, FieldInvertState state) { return state.getBoost(); }
  40     @Override public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
  41     @Override public float tf(float freq) { return freq; }
  42     @Override public float sloppyFreq(int distance) { return 2.0f; }
  43     @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
  44     @Override public float coord(int overlap, int maxOverlap) { return 1.0f; }
  45     @Override public IDFExplanation idfExplain(Collection<Term> terms, Searcher searcher) throws IOException {
  46       return new IDFExplanation() {
  47         @Override
  48         public float getIdf() {
  49           return 1.0f;
  50         }
  51         @Override
  52         public String explain() {
  53           return "Inexplicable";
  54         }
  55       };
  56     }
  57   }
  58
  59   // Tests whether the DocumentWriter correctly enable the
  60   // omitTermFreqAndPositions bit in the FieldInfo
  61   public void testOmitTermFreqAndPositions() throws Exception {
  62     Directory ram = newDirectory();
  63     Analyzer analyzer = new MockAnalyzer(random);
  64     IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
  65     Document d = new Document();
  66
  67     // this field will have Tf
  68     Field f1 = newField("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
  69     d.add(f1);
  70
  71     // this field will NOT have Tf
  72     Field f2 = newField("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED);
  73     f2.setIndexOptions(IndexOptions.DOCS_ONLY);
  74     d.add(f2);
  75
  76     writer.addDocument(d);
  77     writer.optimize();
  78     // now we add another document which has term freq for field f2 and not for f1 and verify if the SegmentMerger
  79     // keep things constant
  80     d = new Document();
  81
  82     // Reverese
  83     f1.setIndexOptions(IndexOptions.DOCS_ONLY);
  84     d.add(f1);
  85
  86     f2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  87     d.add(f2);
  88
  89     writer.addDocument(d);
  90     // force merge
  91     writer.optimize();
  92     // flush
  93     writer.close();
  94
  95     SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
  96     FieldInfos fi = reader.fieldInfos();
  97     assertEquals("OmitTermFreqAndPositions field bit should be set.", IndexOptions.DOCS_ONLY, fi.fieldInfo("f1").indexOptions);
  98     assertEquals("OmitTermFreqAndPositions field bit should be set.", IndexOptions.DOCS_ONLY, fi.fieldInfo("f2").indexOptions);
  99
 100     reader.close();
 101     ram.close();
 102   }
 103
 104   // Tests whether merging of docs that have different
 105   // omitTermFreqAndPositions for the same field works
 106   public void testMixedMerge() throws Exception {
 107     Directory ram = newDirectory();
 108     Analyzer analyzer = new MockAnalyzer(random);
 109     IndexWriter writer = new IndexWriter(
 110         ram,
 111         newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).
 112             setMaxBufferedDocs(3).
 113             setMergePolicy(newLogMergePolicy(2))
 114     );
 115     writer.setInfoStream(VERBOSE ? System.out : null);
 116     Document d = new Document();
 117
 118     // this field will have Tf
 119     Field f1 = newField("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
 120     d.add(f1);
 121
 122     // this field will NOT have Tf
 123     Field f2 = newField("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED);
 124     f2.setIndexOptions(IndexOptions.DOCS_ONLY);
 125     d.add(f2);
 126
 127     for(int i=0;i<30;i++)
 128       writer.addDocument(d);
 129
 130     // now we add another document which has term freq for field f2 and not for f1 and verify if the SegmentMerger
 131     // keep things constant
 132     d = new Document();
 133
 134     // Reverese
 135     f1.setIndexOptions(IndexOptions.DOCS_ONLY);
 136     d.add(f1);
 137
 138     f2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
 139     d.add(f2);
 140
 141     for(int i=0;i<30;i++)
 142       writer.addDocument(d);
 143
 144     // force merge
 145     writer.optimize();
 146     // flush
 147     writer.close();
 148
 149     SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
 150     FieldInfos fi = reader.fieldInfos();
 151     assertEquals("OmitTermFreqAndPositions field bit should be set.", IndexOptions.DOCS_ONLY, fi.fieldInfo("f1").indexOptions);
 152     assertEquals("OmitTermFreqAndPositions field bit should be set.", IndexOptions.DOCS_ONLY, fi.fieldInfo("f2").indexOptions);
 153
 154     reader.close();
 155     ram.close();
 156   }
 157
 158   // Make sure first adding docs that do not omitTermFreqAndPositions for
 159   // field X, then adding docs that do omitTermFreqAndPositions for that same
 160   // field,
 161   public void testMixedRAM() throws Exception {
 162     Directory ram = newDirectory();
 163     Analyzer analyzer = new MockAnalyzer(random);
 164     IndexWriter writer = new IndexWriter(
 165         ram,
 166         newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).
 167             setMaxBufferedDocs(10).
 168             setMergePolicy(newLogMergePolicy(2))
 169     );
 170     Document d = new Document();
 171
 172     // this field will have Tf
 173     Field f1 = newField("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);
 174     d.add(f1);
 175
 176     // this field will NOT have Tf
 177     Field f2 = newField("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED);
 178     d.add(f2);
 179
 180     for(int i=0;i<5;i++)
 181       writer.addDocument(d);
 182
 183     f2.setIndexOptions(IndexOptions.DOCS_ONLY);
 184
 185     for(int i=0;i<20;i++)
 186       writer.addDocument(d);
 187
 188     // force merge
 189     writer.optimize();
 190
 191     // flush
 192     writer.close();
 193
 194     SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
 195     FieldInfos fi = reader.fieldInfos();
 196     assertEquals("OmitTermFreqAndPositions field bit should not be set.", IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, fi.fieldInfo("f1").indexOptions);
 197     assertEquals("OmitTermFreqAndPositions field bit should be set.", IndexOptions.DOCS_ONLY, fi.fieldInfo("f2").indexOptions);
 198
 199     reader.close();
 200     ram.close();
 201   }
 202
 203   private void assertNoPrx(Directory dir) throws Throwable {
 204     final String[] files = dir.listAll();
 205     for(int i=0;i<files.length;i++) {
 206       assertFalse(files[i].endsWith(".prx"));
 207       assertFalse(files[i].endsWith(".pos"));
 208     }
 209   }
 210
 211   // Verifies no *.prx exists when all fields omit term freq:
 212   public void testNoPrxFile() throws Throwable {
 213     Directory ram = newDirectory();
 214     Analyzer analyzer = new MockAnalyzer(random);
 215     IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(
 216                                                                    TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(3).setMergePolicy(newLogMergePolicy()));
 217     LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy();
 218     lmp.setMergeFactor(2);
 219     lmp.setUseCompoundFile(false);
 220     Document d = new Document();
 221
 222     Field f1 = newField("f1", "This field has no term freqs", Field.Store.NO, Field.Index.ANALYZED);
 223     f1.setIndexOptions(IndexOptions.DOCS_ONLY);
 224     d.add(f1);
 225
 226     for(int i=0;i<30;i++)
 227       writer.addDocument(d);
 228
 229     writer.commit();
 230
 231     assertNoPrx(ram);
 232
 233     // now add some documents with positions, and check there is no prox after optimization
 234     d = new Document();
 235     f1 = newField("f1", "This field has positions", Field.Store.NO, Field.Index.ANALYZED);
 236     d.add(f1);
 237
 238     for(int i=0;i<30;i++)
 239       writer.addDocument(d);
 240
 241     // force merge
 242     writer.optimize();
 243     // flush
 244     writer.close();
 245
 246     assertNoPrx(ram);
 247     ram.close();
 248   }
 249
 250   // Test scores with one field with Term Freqs and one without, otherwise with equal content
 251   public void testBasic() throws Exception {
 252     Directory dir = newDirectory();
 253     Analyzer analyzer = new MockAnalyzer(random);
 254     IndexWriter writer = new IndexWriter(
 255         dir,
 256         newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).
 257             setMaxBufferedDocs(2).
 258             setSimilarity(new SimpleSimilarity()).
 259             setMergePolicy(newLogMergePolicy(2))
 260     );
 261     writer.setInfoStream(VERBOSE ? System.out : null);
 262
 263     StringBuilder sb = new StringBuilder(265);
 264     String term = "term";
 265     for(int i = 0; i<30; i++){
 266       Document d = new Document();
 267       sb.append(term).append(" ");
 268       String content  = sb.toString();
 269       Field noTf = newField("noTf", content + (i%2==0 ? "" : " notf"), Field.Store.NO, Field.Index.ANALYZED);
 270       noTf.setIndexOptions(IndexOptions.DOCS_ONLY);
 271       d.add(noTf);
 272
 273       Field tf = newField("tf", content + (i%2==0 ? " tf" : ""), Field.Store.NO, Field.Index.ANALYZED);
 274       d.add(tf);
 275
 276       writer.addDocument(d);
 277       //System.out.println(d);
 278     }
 279
 280     writer.optimize();
 281     // flush
 282     writer.close();
 283
 284     /*
 285      * Verify the index
 286      */
 287     Searcher searcher = new IndexSearcher(dir, true);
 288     searcher.setSimilarity(new SimpleSimilarity());
 289
 290     Term a = new Term("noTf", term);
 291     Term b = new Term("tf", term);
 292     Term c = new Term("noTf", "notf");
 293     Term d = new Term("tf", "tf");
 294     TermQuery q1 = new TermQuery(a);
 295     TermQuery q2 = new TermQuery(b);
 296     TermQuery q3 = new TermQuery(c);
 297     TermQuery q4 = new TermQuery(d);
 298
 299
 300     searcher.search(q1,
 301                     new CountingHitCollector() {
 302                       private Scorer scorer;
 303                       @Override
 304                       public final void setScorer(Scorer scorer) {
 305                         this.scorer = scorer;
 306                       }
 307                       @Override
 308                       public final void collect(int doc) throws IOException {
 309                         //System.out.println("Q1: Doc=" + doc + " score=" + score);
 310                         float score = scorer.score();
 311                         assertTrue(score==1.0f);
 312                         super.collect(doc);
 313                       }
 314                     });
 315     //System.out.println(CountingHitCollector.getCount());
 316
 317
 318     searcher.search(q2,
 319                     new CountingHitCollector() {
 320                       private Scorer scorer;
 321                       @Override
 322                       public final void setScorer(Scorer scorer) {
 323                         this.scorer = scorer;
 324                       }
 325                       @Override
 326                       public final void collect(int doc) throws IOException {
 327                         //System.out.println("Q2: Doc=" + doc + " score=" + score);
 328                         float score = scorer.score();
 329                         assertEquals(1.0f+doc, score, 0.00001f);
 330                         super.collect(doc);
 331                       }
 332                     });
 333     //System.out.println(CountingHitCollector.getCount());
 334
 335
 336
 337
 338
 339     searcher.search(q3,
 340                     new CountingHitCollector() {
 341                       private Scorer scorer;
 342                       @Override
 343                       public final void setScorer(Scorer scorer) {
 344                         this.scorer = scorer;
 345                       }
 346                       @Override
 347                       public final void collect(int doc) throws IOException {
 348                         //System.out.println("Q1: Doc=" + doc + " score=" + score);
 349                         float score = scorer.score();
 350                         assertTrue(score==1.0f);
 351                         assertFalse(doc%2==0);
 352                         super.collect(doc);
 353                       }
 354                     });
 355     //System.out.println(CountingHitCollector.getCount());
 356
 357
 358     searcher.search(q4,
 359                     new CountingHitCollector() {
 360                       private Scorer scorer;
 361                       @Override
 362                       public final void setScorer(Scorer scorer) {
 363                         this.scorer = scorer;
 364                       }
 365                       @Override
 366                       public final void collect(int doc) throws IOException {
 367                         float score = scorer.score();
 368                         //System.out.println("Q1: Doc=" + doc + " score=" + score);
 369                         assertTrue(score==1.0f);
 370                         assertTrue(doc%2==0);
 371                         super.collect(doc);
 372                       }
 373                     });
 374     //System.out.println(CountingHitCollector.getCount());
 375
 376
 377
 378     BooleanQuery bq = new BooleanQuery();
 379     bq.add(q1,Occur.MUST);
 380     bq.add(q4,Occur.MUST);
 381
 382     searcher.search(bq,
 383                     new CountingHitCollector() {
 384                       @Override
 385                       public final void collect(int doc) throws IOException {
 386                         //System.out.println("BQ: Doc=" + doc + " score=" + score);
 387                         super.collect(doc);
 388                       }
 389                     });
 390     assertTrue(15 == CountingHitCollector.getCount());
 391
 392     searcher.close();
 393     dir.close();
 394   }
 395
 396   public static class CountingHitCollector extends Collector {
 397     static int count=0;
 398     static int sum=0;
 399     private int docBase = -1;
 400     CountingHitCollector(){count=0;sum=0;}
 401     @Override
 402     public void setScorer(Scorer scorer) throws IOException {}
 403     @Override
 404     public void collect(int doc) throws IOException {
 405       count++;
 406       sum += doc + docBase;  // use it to avoid any possibility of being optimized away
 407     }
 408
 409     public static int getCount() { return count; }
 410     public static int getSum() { return sum; }
 411
 412     @Override
 413     public void setNextReader(IndexReader reader, int docBase) {
 414       this.docBase = docBase;
 415     }
 416     @Override
 417     public boolean acceptsDocsOutOfOrder() {
 418       return true;
 419     }
 420   }
 421 }