lucene-java-3.4.0/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.benchmark.byTask;
  19
  20 import java.io.BufferedReader;
  21 import java.io.File;
  22 import java.io.FileReader;
  23 import java.io.StringReader;
  24 import java.text.Collator;
  25 import java.util.List;
  26 import java.util.Locale;
  27
  28 import org.apache.lucene.analysis.Analyzer;
  29 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  30 import org.apache.lucene.analysis.MockAnalyzer;
  31 import org.apache.lucene.analysis.TokenStream;
  32 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  33 import org.apache.lucene.benchmark.BenchmarkTestCase;
  34 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
  35 import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
  36 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
  37 import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
  38 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
  39 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
  40 import org.apache.lucene.collation.CollationKeyAnalyzer;
  41 import org.apache.lucene.index.IndexReader;
  42 import org.apache.lucene.index.IndexWriter;
  43 import org.apache.lucene.index.IndexWriterConfig;
  44 import org.apache.lucene.index.LogDocMergePolicy;
  45 import org.apache.lucene.index.LogMergePolicy;
  46 import org.apache.lucene.index.Term;
  47 import org.apache.lucene.index.TermEnum;
  48 import org.apache.lucene.index.TermDocs;
  49 import org.apache.lucene.index.SegmentInfos;
  50 import org.apache.lucene.index.SerialMergeScheduler;
  51 import org.apache.lucene.index.TermFreqVector;
  52 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  53 import org.apache.lucene.store.Directory;
  54 import org.apache.lucene.search.FieldCache.StringIndex;
  55 import org.apache.lucene.search.FieldCache;
  56
  57 /**
  58  * Test very simply that perf tasks - simple algorithms - are doing what they should.
  59  */
  60 public class TestPerfTasksLogic extends BenchmarkTestCase {
  61
  62   @Override
  63   public void setUp() throws Exception {
  64     super.setUp();
  65     copyToWorkDir("reuters.first20.lines.txt");
  66   }
  67
  68   /**
  69    * Test index creation logic
  70    */
  71   public void testIndexAndSearchTasks() throws Exception {
  72     // 1. alg definition (required in every "logic" test)
  73     String algLines[] = {
  74         "ResetSystemErase",
  75         "CreateIndex",
  76         "{ AddDoc } : 1000",
  77         "Optimize",
  78         "CloseIndex",
  79         "OpenReader",
  80         "{ CountingSearchTest } : 200",
  81         "CloseReader",
  82         "[ CountingSearchTest > : 70",
  83         "[ CountingSearchTest > : 9",
  84     };
  85
  86     // 2. we test this value later
  87     CountingSearchTestTask.numSearches = 0;
  88
  89     // 3. execute the algorithm  (required in every "logic" test)
  90     Benchmark benchmark = execBenchmark(algLines);
  91
  92     // 4. test specific checks after the benchmark run completed.
  93     assertEquals("TestSearchTask was supposed to be called!",279,CountingSearchTestTask.numSearches);
  94     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
  95     // now we should be able to open the index for write.
  96     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
  97         new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
  98             .setOpenMode(OpenMode.APPEND));
  99     iw.close();
 100     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 101     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
 102     ir.close();
 103   }
 104
 105   /**
 106    * Test timed sequence task.
 107    */
 108   public void testTimedSearchTask() throws Exception {
 109     String algLines[] = {
 110         "log.step=100000",
 111         "ResetSystemErase",
 112         "CreateIndex",
 113         "{ AddDoc } : 100",
 114         "Optimize",
 115         "CloseIndex",
 116         "OpenReader",
 117         "{ CountingSearchTest } : .5s",
 118         "CloseReader",
 119     };
 120
 121     CountingSearchTestTask.numSearches = 0;
 122     execBenchmark(algLines);
 123     assertTrue(CountingSearchTestTask.numSearches > 0);
 124     long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
 125     assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
 126   }
 127
 128   // disabled until we fix BG thread prio -- this test
 129   // causes build to hang
 130   public void testBGSearchTaskThreads() throws Exception {
 131     String algLines[] = {
 132         "log.time.step.msec = 100",
 133         "log.step=100000",
 134         "ResetSystemErase",
 135         "CreateIndex",
 136         "{ AddDoc } : 1000",
 137         "Optimize",
 138         "CloseIndex",
 139         "OpenReader",
 140         "{",
 141         "  [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
 142         "  Wait(0.5)",
 143         "}",
 144         "CloseReader",
 145         "RepSumByPref X"
 146     };
 147
 148     CountingSearchTestTask.numSearches = 0;
 149     execBenchmark(algLines);
 150     assertTrue(CountingSearchTestTask.numSearches > 0);
 151   }
 152
 153   public void testHighlighting() throws Exception {
 154     // 1. alg definition (required in every "logic" test)
 155     String algLines[] = {
 156         "doc.stored=true",
 157         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 158         "docs.file=" + getReuters20LinesFile(),
 159         "query.maker=" + ReutersQueryMaker.class.getName(),
 160         "ResetSystemErase",
 161         "CreateIndex",
 162         "{ AddDoc } : 100",
 163         "Optimize",
 164         "CloseIndex",
 165         "OpenReader(true)",
 166         "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
 167         "CloseReader",
 168     };
 169
 170     // 2. we test this value later
 171     CountingHighlighterTestTask.numHighlightedResults = 0;
 172     CountingHighlighterTestTask.numDocsRetrieved = 0;
 173     // 3. execute the algorithm  (required in every "logic" test)
 174     Benchmark benchmark = execBenchmark(algLines);
 175
 176     // 4. test specific checks after the benchmark run completed.
 177     assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
 178     //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
 179     //we probably should use a different doc/query maker, but...
 180     assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
 181
 182     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
 183     // now we should be able to open the index for write.
 184     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
 185     iw.close();
 186     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 187     assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
 188     ir.close();
 189   }
 190
 191   public void testHighlightingTV() throws Exception {
 192     // 1. alg definition (required in every "logic" test)
 193     String algLines[] = {
 194         "doc.stored=true",//doc storage is required in order to have text to highlight
 195         "doc.term.vector.offsets=true",
 196         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 197         "docs.file=" + getReuters20LinesFile(),
 198         "query.maker=" + ReutersQueryMaker.class.getName(),
 199         "ResetSystemErase",
 200         "CreateIndex",
 201         "{ AddDoc } : 1000",
 202         "Optimize",
 203         "CloseIndex",
 204         "OpenReader(false)",
 205         "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
 206         "CloseReader",
 207     };
 208
 209     // 2. we test this value later
 210     CountingHighlighterTestTask.numHighlightedResults = 0;
 211     CountingHighlighterTestTask.numDocsRetrieved = 0;
 212     // 3. execute the algorithm  (required in every "logic" test)
 213     Benchmark benchmark = execBenchmark(algLines);
 214
 215     // 4. test specific checks after the benchmark run completed.
 216     assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
 217     //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
 218     //we probably should use a different doc/query maker, but...
 219     assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
 220
 221     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
 222     // now we should be able to open the index for write.
 223     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
 224     iw.close();
 225     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 226     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
 227     ir.close();
 228   }
 229
 230   public void testHighlightingNoTvNoStore() throws Exception {
 231     // 1. alg definition (required in every "logic" test)
 232     String algLines[] = {
 233         "doc.stored=false",
 234         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 235         "docs.file=" + getReuters20LinesFile(),
 236         "query.maker=" + ReutersQueryMaker.class.getName(),
 237         "ResetSystemErase",
 238         "CreateIndex",
 239         "{ AddDoc } : 1000",
 240         "Optimize",
 241         "CloseIndex",
 242         "OpenReader",
 243         "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
 244         "CloseReader",
 245     };
 246
 247     // 2. we test this value later
 248     CountingHighlighterTestTask.numHighlightedResults = 0;
 249     CountingHighlighterTestTask.numDocsRetrieved = 0;
 250     // 3. execute the algorithm  (required in every "logic" test)
 251     try {
 252       Benchmark benchmark = execBenchmark(algLines);
 253       assertTrue("CountingHighlighterTest should have thrown an exception", false);
 254       assertNotNull(benchmark); // (avoid compile warning on unused variable)
 255     } catch (Exception e) {
 256       assertTrue(true);
 257     }
 258   }
 259
 260   /**
 261    * Test Exhasting Doc Maker logic
 262    */
 263   public void testExhaustContentSource() throws Exception {
 264     // 1. alg definition (required in every "logic" test)
 265     String algLines[] = {
 266         "# ----- properties ",
 267         "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
 268         "content.source.log.step=1",
 269         "doc.term.vector=false",
 270         "content.source.forever=false",
 271         "directory=RAMDirectory",
 272         "doc.stored=false",
 273         "doc.tokenized=false",
 274         "# ----- alg ",
 275         "CreateIndex",
 276         "{ AddDoc } : * ",
 277         "Optimize",
 278         "CloseIndex",
 279         "OpenReader",
 280         "{ CountingSearchTest } : 100",
 281         "CloseReader",
 282         "[ CountingSearchTest > : 30",
 283         "[ CountingSearchTest > : 9",
 284     };
 285
 286     // 2. we test this value later
 287     CountingSearchTestTask.numSearches = 0;
 288
 289     // 3. execute the algorithm  (required in every "logic" test)
 290     Benchmark benchmark = execBenchmark(algLines);
 291
 292     // 4. test specific checks after the benchmark run completed.
 293     assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
 294     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
 295     // now we should be able to open the index for write.
 296     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
 297     iw.close();
 298     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 299     assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
 300     ir.close();
 301   }
 302
 303   // LUCENE-1994: test thread safety of SortableSingleDocMaker
 304   public void testDocMakerThreadSafety() throws Exception {
 305     // 1. alg definition (required in every "logic" test)
 306     String algLines[] = {
 307         "# ----- properties ",
 308         "content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
 309         "doc.term.vector=false",
 310         "log.step.AddDoc=10000",
 311         "content.source.forever=true",
 312         "directory=RAMDirectory",
 313         "doc.reuse.fields=false",
 314         "doc.stored=false",
 315         "doc.tokenized=false",
 316         "doc.index.props=true",
 317         "# ----- alg ",
 318         "CreateIndex",
 319         "[ { AddDoc > : 250 ] : 4",
 320         "CloseIndex",
 321     };
 322
 323     // 2. we test this value later
 324     CountingSearchTestTask.numSearches = 0;
 325
 326     // 3. execute the algorithm  (required in every "logic" test)
 327     Benchmark benchmark = execBenchmark(algLines);
 328
 329     IndexReader r = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 330     StringIndex idx = FieldCache.DEFAULT.getStringIndex(r, "country");
 331     final int maxDoc = r.maxDoc();
 332     assertEquals(1000, maxDoc);
 333     for(int i=0;i<1000;i++) {
 334       assertNotNull("doc " + i + " has null country", idx.lookup[idx.order[i]]);
 335     }
 336     r.close();
 337   }
 338
 339   /**
 340    * Test Parallel Doc Maker logic (for LUCENE-940)
 341    */
 342   public void testParallelDocMaker() throws Exception {
 343     // 1. alg definition (required in every "logic" test)
 344     String algLines[] = {
 345         "# ----- properties ",
 346         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 347         "docs.file=" + getReuters20LinesFile(),
 348         "content.source.log.step=3",
 349         "doc.term.vector=false",
 350         "content.source.forever=false",
 351         "directory=FSDirectory",
 352         "doc.stored=false",
 353         "doc.tokenized=false",
 354         "# ----- alg ",
 355         "CreateIndex",
 356         "[ { AddDoc } : * ] : 4 ",
 357         "CloseIndex",
 358     };
 359
 360     // 2. execute the algorithm  (required in every "logic" test)
 361     Benchmark benchmark = execBenchmark(algLines);
 362
 363     // 3. test number of docs in the index
 364     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 365     int ndocsExpected = 20; // first 20 reuters docs.
 366     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 367     ir.close();
 368   }
 369
 370   /**
 371    * Test WriteLineDoc and LineDocSource.
 372    */
 373   public void testLineDocFile() throws Exception {
 374     File lineFile = new File(TEMP_DIR, "test.reuters.lines.txt");
 375
 376     // We will call WriteLineDocs this many times
 377     final int NUM_TRY_DOCS = 50;
 378
 379     // Creates a line file with first 50 docs from SingleDocSource
 380     String algLines1[] = {
 381       "# ----- properties ",
 382       "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
 383       "content.source.forever=true",
 384       "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
 385       "# ----- alg ",
 386       "{WriteLineDoc()}:" + NUM_TRY_DOCS,
 387     };
 388
 389     // Run algo
 390     Benchmark benchmark = execBenchmark(algLines1);
 391
 392     BufferedReader r = new BufferedReader(new FileReader(lineFile));
 393     int numLines = 0;
 394     String line;
 395     while((line = r.readLine()) != null) {
 396       if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
 397         continue; // do not count the header line as a doc
 398       }
 399       numLines++;
 400     }
 401     r.close();
 402     assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
 403
 404     // Index the line docs
 405     String algLines2[] = {
 406       "# ----- properties ",
 407       "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
 408       "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 409       "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
 410       "content.source.forever=false",
 411       "doc.reuse.fields=false",
 412       "ram.flush.mb=4",
 413       "# ----- alg ",
 414       "ResetSystemErase",
 415       "CreateIndex",
 416       "{AddDoc}: *",
 417       "CloseIndex",
 418     };
 419
 420     // Run algo
 421     benchmark = execBenchmark(algLines2);
 422
 423     // now we should be able to open the index for write.
 424     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
 425         new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
 426             .setOpenMode(OpenMode.APPEND));
 427     iw.close();
 428
 429     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 430     assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
 431     ir.close();
 432
 433     lineFile.delete();
 434   }
 435
 436   /**
 437    * Test ReadTokensTask
 438    */
 439   public void testReadTokens() throws Exception {
 440
 441     // We will call ReadTokens on this many docs
 442     final int NUM_DOCS = 20;
 443
 444     // Read tokens from first NUM_DOCS docs from Reuters and
 445     // then build index from the same docs
 446     String algLines1[] = {
 447       "# ----- properties ",
 448       "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
 449       "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 450       "docs.file=" + getReuters20LinesFile(),
 451       "# ----- alg ",
 452       "{ReadTokens}: " + NUM_DOCS,
 453       "ResetSystemErase",
 454       "CreateIndex",
 455       "{AddDoc}: " + NUM_DOCS,
 456       "CloseIndex",
 457     };
 458
 459     // Run algo
 460     Benchmark benchmark = execBenchmark(algLines1);
 461
 462     List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
 463
 464     // Count how many tokens all ReadTokens saw
 465     int totalTokenCount1 = 0;
 466     for (final TaskStats stat : stats) {
 467       if (stat.getTask().getName().equals("ReadTokens")) {
 468         totalTokenCount1 += stat.getCount();
 469       }
 470     }
 471
 472     // Separately count how many tokens are actually in the index:
 473     IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 474     assertEquals(NUM_DOCS, reader.numDocs());
 475
 476     TermEnum terms = reader.terms();
 477     TermDocs termDocs = reader.termDocs();
 478     int totalTokenCount2 = 0;
 479     while(terms.next()) {
 480       Term term = terms.term();
 481       /* not-tokenized, but indexed field */
 482       if (term != null && term.field() != DocMaker.ID_FIELD && term.field() != DocMaker.DATE_MSEC_FIELD && term.field() != DocMaker.TIME_SEC_FIELD) {
 483           termDocs.seek(terms.term());
 484         while (termDocs.next())
 485           totalTokenCount2 += termDocs.freq();
 486       }
 487     }
 488     reader.close();
 489
 490     // Make sure they are the same
 491     assertEquals(totalTokenCount1, totalTokenCount2);
 492   }
 493
 494   /**
 495    * Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941)
 496    */
 497   public void testParallelExhausted() throws Exception {
 498     // 1. alg definition (required in every "logic" test)
 499     String algLines[] = {
 500         "# ----- properties ",
 501         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 502         "docs.file=" + getReuters20LinesFile(),
 503         "content.source.log.step=3",
 504         "doc.term.vector=false",
 505         "content.source.forever=false",
 506         "directory=RAMDirectory",
 507         "doc.stored=false",
 508         "doc.tokenized=false",
 509         "task.max.depth.log=1",
 510         "# ----- alg ",
 511         "CreateIndex",
 512         "{ [ AddDoc]: 4} : * ",
 513         "ResetInputs ",
 514         "{ [ AddDoc]: 4} : * ",
 515         "WaitForMerges",
 516         "CloseIndex",
 517     };
 518
 519     // 2. execute the algorithm  (required in every "logic" test)
 520     Benchmark benchmark = execBenchmark(algLines);
 521
 522     // 3. test number of docs in the index
 523     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 524     int ndocsExpected = 2 * 20; // first 20 reuters docs.
 525     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 526     ir.close();
 527   }
 528
 529
 530   /**
 531    * Test that exhaust in loop works as expected (LUCENE-1115).
 532    */
 533   public void testExhaustedLooped() throws Exception {
 534     // 1. alg definition (required in every "logic" test)
 535     String algLines[] = {
 536         "# ----- properties ",
 537         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 538         "docs.file=" + getReuters20LinesFile(),
 539         "content.source.log.step=3",
 540         "doc.term.vector=false",
 541         "content.source.forever=false",
 542         "directory=RAMDirectory",
 543         "doc.stored=false",
 544         "doc.tokenized=false",
 545         "task.max.depth.log=1",
 546         "# ----- alg ",
 547         "{ \"Rounds\"",
 548         "  ResetSystemErase",
 549         "  CreateIndex",
 550         "  { \"AddDocs\"  AddDoc > : * ",
 551         "  WaitForMerges",
 552         "  CloseIndex",
 553         "} : 2",
 554     };
 555
 556     // 2. execute the algorithm  (required in every "logic" test)
 557     Benchmark benchmark = execBenchmark(algLines);
 558
 559     // 3. test number of docs in the index
 560     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 561     int ndocsExpected = 20;  // first 20 reuters docs.
 562     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 563     ir.close();
 564   }
 565
 566   /**
 567    * Test that we can close IndexWriter with argument "false".
 568    */
 569   public void testCloseIndexFalse() throws Exception {
 570     // 1. alg definition (required in every "logic" test)
 571     String algLines[] = {
 572         "# ----- properties ",
 573         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 574         "docs.file=" + getReuters20LinesFile(),
 575         "ram.flush.mb=-1",
 576         "max.buffered=2",
 577         "content.source.log.step=3",
 578         "doc.term.vector=false",
 579         "content.source.forever=false",
 580         "directory=RAMDirectory",
 581         "doc.stored=false",
 582         "doc.tokenized=false",
 583         "debug.level=1",
 584         "# ----- alg ",
 585         "{ \"Rounds\"",
 586         "  ResetSystemErase",
 587         "  CreateIndex",
 588         "  { \"AddDocs\"  AddDoc > : * ",
 589         "  CloseIndex(false)",
 590         "} : 2",
 591     };
 592
 593     // 2. execute the algorithm  (required in every "logic" test)
 594     Benchmark benchmark = execBenchmark(algLines);
 595
 596     // 3. test number of docs in the index
 597     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 598     int ndocsExpected = 20; // first 20 reuters docs.
 599     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 600     ir.close();
 601   }
 602
 603   public static class MyMergeScheduler extends SerialMergeScheduler {
 604     boolean called;
 605     public MyMergeScheduler() {
 606       super();
 607       called = true;
 608     }
 609   }
 610
 611   public void testDeleteByPercent() throws Exception {
 612     // 1. alg definition (required in every "logic" test)
 613     String algLines[] = {
 614         "# ----- properties ",
 615         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 616         "docs.file=" + getReuters20LinesFile(),
 617         "ram.flush.mb=-1",
 618         "max.buffered=2",
 619         "content.source.log.step=3",
 620         "doc.term.vector=false",
 621         "content.source.forever=false",
 622         "directory=RAMDirectory",
 623         "doc.stored=false",
 624         "doc.tokenized=false",
 625         "debug.level=1",
 626         "# ----- alg ",
 627         "CreateIndex",
 628         "{ \"AddDocs\"  AddDoc > : * ",
 629         "CloseIndex()",
 630         "OpenReader(false)",
 631         "DeleteByPercent(20)",
 632         "CloseReader"
 633     };
 634
 635     // 2. execute the algorithm  (required in every "logic" test)
 636     Benchmark benchmark = execBenchmark(algLines);
 637
 638     // 3. test number of docs in the index
 639     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 640     int ndocsExpected = 16; // first 20 reuters docs, minus 20%
 641     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 642     ir.close();
 643   }
 644
 645   /**
 646    * Test that we can set merge scheduler".
 647    */
 648   public void testMergeScheduler() throws Exception {
 649     // 1. alg definition (required in every "logic" test)
 650     String algLines[] = {
 651         "# ----- properties ",
 652         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 653         "docs.file=" + getReuters20LinesFile(),
 654         "content.source.log.step=3",
 655         "doc.term.vector=false",
 656         "content.source.forever=false",
 657         "directory=RAMDirectory",
 658         "merge.scheduler=" + MyMergeScheduler.class.getName(),
 659         "doc.stored=false",
 660         "doc.tokenized=false",
 661         "debug.level=1",
 662         "# ----- alg ",
 663         "{ \"Rounds\"",
 664         "  ResetSystemErase",
 665         "  CreateIndex",
 666         "  { \"AddDocs\"  AddDoc > : * ",
 667         "} : 2",
 668     };
 669     // 2. execute the algorithm  (required in every "logic" test)
 670     Benchmark benchmark = execBenchmark(algLines);
 671
 672     assertTrue("did not use the specified MergeScheduler",
 673         ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig()
 674             .getMergeScheduler()).called);
 675     benchmark.getRunData().getIndexWriter().close();
 676
 677     // 3. test number of docs in the index
 678     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 679     int ndocsExpected = 20; // first 20 reuters docs.
 680     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 681     ir.close();
 682   }
 683
 684   public static class MyMergePolicy extends LogDocMergePolicy {
 685     boolean called;
 686     public MyMergePolicy() {
 687       called = true;
 688     }
 689   }
 690
 691   /**
 692    * Test that we can set merge policy".
 693    */
 694   public void testMergePolicy() throws Exception {
 695     // 1. alg definition (required in every "logic" test)
 696     String algLines[] = {
 697         "# ----- properties ",
 698         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 699         "docs.file=" + getReuters20LinesFile(),
 700         "content.source.log.step=3",
 701         "ram.flush.mb=-1",
 702         "max.buffered=2",
 703         "doc.term.vector=false",
 704         "content.source.forever=false",
 705         "directory=RAMDirectory",
 706         "merge.policy=" + MyMergePolicy.class.getName(),
 707         "doc.stored=false",
 708         "doc.tokenized=false",
 709         "debug.level=1",
 710         "# ----- alg ",
 711         "{ \"Rounds\"",
 712         "  ResetSystemErase",
 713         "  CreateIndex",
 714         "  { \"AddDocs\"  AddDoc > : * ",
 715         "} : 2",
 716     };
 717
 718     // 2. execute the algorithm  (required in every "logic" test)
 719     Benchmark benchmark = execBenchmark(algLines);
 720     assertTrue("did not use the specified MergePolicy", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy()).called);
 721     benchmark.getRunData().getIndexWriter().close();
 722
 723     // 3. test number of docs in the index
 724     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 725     int ndocsExpected = 20; // first 20 reuters docs.
 726     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 727     ir.close();
 728   }
 729
 730   /**
 731    * Test that IndexWriter settings stick.
 732    */
 733   public void testIndexWriterSettings() throws Exception {
 734     // 1. alg definition (required in every "logic" test)
 735     String algLines[] = {
 736         "# ----- properties ",
 737         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 738         "docs.file=" + getReuters20LinesFile(),
 739         "content.source.log.step=3",
 740         "ram.flush.mb=-1",
 741         "max.buffered=2",
 742         "compound=cmpnd:true:false",
 743         "doc.term.vector=vector:false:true",
 744         "content.source.forever=false",
 745         "directory=RAMDirectory",
 746         "doc.stored=false",
 747         "merge.factor=3",
 748         "doc.tokenized=false",
 749         "debug.level=1",
 750         "# ----- alg ",
 751         "{ \"Rounds\"",
 752         "  ResetSystemErase",
 753         "  CreateIndex",
 754         "  { \"AddDocs\"  AddDoc > : * ",
 755         "  NewRound",
 756         "} : 2",
 757     };
 758
 759     // 2. execute the algorithm  (required in every "logic" test)
 760     Benchmark benchmark = execBenchmark(algLines);
 761     final IndexWriter writer = benchmark.getRunData().getIndexWriter();
 762     assertEquals(2, writer.getConfig().getMaxBufferedDocs());
 763     assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
 764     assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
 765     assertFalse(((LogMergePolicy) writer.getConfig().getMergePolicy()).getUseCompoundFile());
 766     writer.close();
 767     Directory dir = benchmark.getRunData().getDirectory();
 768     IndexReader reader = IndexReader.open(dir, true);
 769     TermFreqVector [] tfv = reader.getTermFreqVectors(0);
 770     assertNotNull(tfv);
 771     assertTrue(tfv.length > 0);
 772     reader.close();
 773   }
 774
 775   /**
 776    * Test that we can call optimize(maxNumSegments).
 777    */
 778   public void testOptimizeMaxNumSegments() throws Exception {
 779     // 1. alg definition (required in every "logic" test)
 780     String algLines[] = {
 781         "# ----- properties ",
 782         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 783         "docs.file=" + getReuters20LinesFile(),
 784         "content.source.log.step=3",
 785         "ram.flush.mb=-1",
 786         "max.buffered=3",
 787         "doc.term.vector=false",
 788         "content.source.forever=false",
 789         "directory=RAMDirectory",
 790         "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
 791         "doc.stored=false",
 792         "doc.tokenized=false",
 793         "debug.level=1",
 794         "# ----- alg ",
 795         "{ \"Rounds\"",
 796         "  ResetSystemErase",
 797         "  CreateIndex",
 798         "  { \"AddDocs\"  AddDoc > : * ",
 799         "  Optimize(3)",
 800         "  CloseIndex()",
 801         "} : 2",
 802     };
 803
 804     // 2. execute the algorithm  (required in every "logic" test)
 805     Benchmark benchmark = execBenchmark(algLines);
 806
 807     // 3. test number of docs in the index
 808     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 809     int ndocsExpected = 20; // first 20 reuters docs.
 810     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 811     ir.close();
 812
 813     // Make sure we have 3 segments:
 814     SegmentInfos infos = new SegmentInfos();
 815     infos.read(benchmark.getRunData().getDirectory());
 816     assertEquals(3, infos.size());
 817   }
 818
 819   /**
 820    * Test disabling task count (LUCENE-1136).
 821    */
 822   public void testDisableCounting() throws Exception {
 823     doTestDisableCounting(true);
 824     doTestDisableCounting(false);
 825   }
 826
 827   private void doTestDisableCounting(boolean disable) throws Exception {
 828     // 1. alg definition (required in every "logic" test)
 829     String algLines[] = disableCountingLines(disable);
 830
 831     // 2. execute the algorithm  (required in every "logic" test)
 832     Benchmark benchmark = execBenchmark(algLines);
 833
 834     // 3. test counters
 835     int n = disable ? 0 : 1;
 836     int nChecked = 0;
 837     for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
 838       String taskName = stats.getTask().getName();
 839       if (taskName.equals("Rounds")) {
 840         assertEquals("Wrong total count!",20+2*n,stats.getCount());
 841         nChecked++;
 842       } else if (taskName.equals("CreateIndex")) {
 843         assertEquals("Wrong count for CreateIndex!",n,stats.getCount());
 844         nChecked++;
 845       } else if (taskName.equals("CloseIndex")) {
 846         assertEquals("Wrong count for CloseIndex!",n,stats.getCount());
 847         nChecked++;
 848       }
 849     }
 850     assertEquals("Missing some tasks to check!",3,nChecked);
 851   }
 852
 853   private String[] disableCountingLines (boolean disable) {
 854     String dis = disable ? "-" : "";
 855     return new String[] {
 856         "# ----- properties ",
 857         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 858         "docs.file=" + getReuters20LinesFile(),
 859         "content.source.log.step=30",
 860         "doc.term.vector=false",
 861         "content.source.forever=false",
 862         "directory=RAMDirectory",
 863         "doc.stored=false",
 864         "doc.tokenized=false",
 865         "task.max.depth.log=1",
 866         "# ----- alg ",
 867         "{ \"Rounds\"",
 868         "  ResetSystemErase",
 869         "  "+dis+"CreateIndex",            // optionally disable counting here
 870         "  { \"AddDocs\"  AddDoc > : * ",
 871         "  "+dis+"  CloseIndex",             // optionally disable counting here (with extra blanks)
 872         "}",
 873         "RepSumByName",
 874     };
 875   }
 876
 877   /**
 878    * Test that we can change the Locale in the runData,
 879    * that it is parsed as we expect.
 880    */
 881   public void testLocale() throws Exception {
 882     // empty Locale: clear it (null)
 883     Benchmark benchmark = execBenchmark(getLocaleConfig(""));
 884     assertNull(benchmark.getRunData().getLocale());
 885
 886     // ROOT locale
 887     benchmark = execBenchmark(getLocaleConfig("ROOT"));
 888     assertEquals(new Locale(""), benchmark.getRunData().getLocale());
 889
 890     // specify just a language
 891     benchmark = execBenchmark(getLocaleConfig("de"));
 892     assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
 893
 894     // specify language + country
 895     benchmark = execBenchmark(getLocaleConfig("en,US"));
 896     assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
 897
 898     // specify language + country + variant
 899     benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
 900     assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
 901   }
 902
 903   private String[] getLocaleConfig(String localeParam) {
 904     String algLines[] = {
 905         "# ----- properties ",
 906         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 907         "docs.file=" + getReuters20LinesFile(),
 908         "content.source.log.step=3",
 909         "content.source.forever=false",
 910         "directory=RAMDirectory",
 911         "# ----- alg ",
 912         "{ \"Rounds\"",
 913         "  ResetSystemErase",
 914         "  NewLocale(" + localeParam + ")",
 915         "  CreateIndex",
 916         "  { \"AddDocs\"  AddDoc > : * ",
 917         "  NewRound",
 918         "} : 1",
 919     };
 920     return algLines;
 921   }
 922
 923   /**
 924    * Test that we can create CollationAnalyzers.
 925    */
 926   public void testCollator() throws Exception {
 927     // ROOT locale
 928     Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
 929     CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
 930         .getInstance(new Locale("")));
 931     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 932
 933     // specify just a language
 934     benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
 935     expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
 936     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 937
 938     // specify language + country
 939     benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
 940     expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
 941         "US")));
 942     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 943
 944     // specify language + country + variant
 945     benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
 946     expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
 947         "NO", "NY")));
 948     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 949   }
 950
 951   private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
 952       throws Exception {
 953     TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
 954     TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
 955     ts1.reset();
 956     ts2.reset();
 957     CharTermAttribute termAtt1 = ts1.addAttribute(CharTermAttribute.class);
 958     CharTermAttribute termAtt2 = ts2.addAttribute(CharTermAttribute.class);
 959     assertTrue(ts1.incrementToken());
 960     assertTrue(ts2.incrementToken());
 961     assertEquals(termAtt1.toString(), termAtt2.toString());
 962     assertFalse(ts1.incrementToken());
 963     assertFalse(ts2.incrementToken());
 964     ts1.close();
 965     ts2.close();
 966   }
 967
 968   private String[] getCollatorConfig(String localeParam,
 969       String collationParam) {
 970     String algLines[] = {
 971         "# ----- properties ",
 972         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 973         "docs.file=" + getReuters20LinesFile(),
 974         "content.source.log.step=3",
 975         "content.source.forever=false",
 976         "directory=RAMDirectory",
 977         "# ----- alg ",
 978         "{ \"Rounds\"",
 979         "  ResetSystemErase",
 980         "  NewLocale(" + localeParam + ")",
 981         "  NewCollationAnalyzer(" + collationParam + ")",
 982         "  CreateIndex",
 983         "  { \"AddDocs\"  AddDoc > : * ",
 984         "  NewRound",
 985         "} : 1",
 986     };
 987     return algLines;
 988   }
 989
 990   /**
 991    * Test that we can create ShingleAnalyzerWrappers.
 992    */
 993   public void testShingleAnalyzer() throws Exception {
 994     String text = "one,two,three, four five six";
 995
 996     // Default analyzer, maxShingleSize, and outputUnigrams
 997     Benchmark benchmark = execBenchmark(getShingleConfig(""));
 998     benchmark.getRunData().getAnalyzer().tokenStream
 999       ("bogus", new StringReader(text)).close();
1000     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1001                        new String[] {"one", "one two", "two", "two three",
1002                                      "three", "three four", "four", "four five",
1003                                      "five", "five six", "six"});
1004     // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
1005     benchmark = execBenchmark
1006       (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
1007     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1008                        new String[] { "one two", "one two three", "two three",
1009                                       "two three four", "three four",
1010                                       "three four five", "four five",
1011                                       "four five six", "five six" });
1012     // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
1013     benchmark = execBenchmark
1014       (getShingleConfig("analyzer:WhitespaceAnalyzer"));
1015     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1016                        new String[] { "one,two,three,", "one,two,three, four",
1017                                       "four", "four five", "five", "five six",
1018                                       "six" });
1019
1020     // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
1021     benchmark = execBenchmark
1022       (getShingleConfig
1023         ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
1024     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1025                        new String[] { "one,two,three, four",
1026                                       "one,two,three, four five",
1027                                       "four five", "four five six",
1028                                       "five six" });
1029   }
1030
1031   private void assertEqualShingle
1032     (Analyzer analyzer, String text, String[] expected) throws Exception {
1033     BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
1034   }
1035
1036   private String[] getShingleConfig(String params) {
1037     String algLines[] = {
1038         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
1039         "docs.file=" + getReuters20LinesFile(),
1040         "content.source.forever=false",
1041         "directory=RAMDirectory",
1042         "NewShingleAnalyzer(" + params + ")",
1043         "CreateIndex",
1044         "{ \"AddDocs\"  AddDoc > : * "
1045     };
1046     return algLines;
1047   }
1048
1049   private String getReuters20LinesFile() {
1050     return getWorkDirResourcePath("reuters.first20.lines.txt");
1051   }
1052 }