lucene-java-3.5.0/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.benchmark.byTask;
  19
  20 import java.io.BufferedReader;
  21 import java.io.File;
  22 import java.io.FileReader;
  23 import java.io.StringReader;
  24 import java.text.Collator;
  25 import java.util.List;
  26 import java.util.Locale;
  27
  28 import org.apache.lucene.analysis.Analyzer;
  29 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  30 import org.apache.lucene.analysis.MockAnalyzer;
  31 import org.apache.lucene.analysis.TokenStream;
  32 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  33 import org.apache.lucene.benchmark.BenchmarkTestCase;
  34 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
  35 import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
  36 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
  37 import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
  38 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
  39 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
  40 import org.apache.lucene.collation.CollationKeyAnalyzer;
  41 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
  42 import org.apache.lucene.index.IndexReader;
  43 import org.apache.lucene.index.IndexWriter;
  44 import org.apache.lucene.index.IndexWriterConfig;
  45 import org.apache.lucene.index.LogDocMergePolicy;
  46 import org.apache.lucene.index.LogMergePolicy;
  47 import org.apache.lucene.index.Term;
  48 import org.apache.lucene.index.TermEnum;
  49 import org.apache.lucene.index.TermDocs;
  50 import org.apache.lucene.index.SegmentInfos;
  51 import org.apache.lucene.index.SerialMergeScheduler;
  52 import org.apache.lucene.index.TermFreqVector;
  53 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  54 import org.apache.lucene.store.Directory;
  55 import org.apache.lucene.search.FieldCache.StringIndex;
  56 import org.apache.lucene.search.FieldCache;
  57
  58 /**
  59  * Test very simply that perf tasks - simple algorithms - are doing what they should.
  60  */
  61 public class TestPerfTasksLogic extends BenchmarkTestCase {
  62
  63   @Override
  64   public void setUp() throws Exception {
  65     super.setUp();
  66     copyToWorkDir("reuters.first20.lines.txt");
  67   }
  68
  69   /**
  70    * Test index creation logic
  71    */
  72   public void testIndexAndSearchTasks() throws Exception {
  73     // 1. alg definition (required in every "logic" test)
  74     String algLines[] = {
  75         "ResetSystemErase",
  76         "CreateIndex",
  77         "{ AddDoc } : 1000",
  78         "ForceMerge(1)",
  79         "CloseIndex",
  80         "OpenReader",
  81         "{ CountingSearchTest } : 200",
  82         "CloseReader",
  83         "[ CountingSearchTest > : 70",
  84         "[ CountingSearchTest > : 9",
  85     };
  86
  87     // 2. we test this value later
  88     CountingSearchTestTask.numSearches = 0;
  89
  90     // 3. execute the algorithm  (required in every "logic" test)
  91     Benchmark benchmark = execBenchmark(algLines);
  92
  93     // 4. test specific checks after the benchmark run completed.
  94     assertEquals("TestSearchTask was supposed to be called!",279,CountingSearchTestTask.numSearches);
  95     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
  96     // now we should be able to open the index for write.
  97     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
  98         new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
  99             .setOpenMode(OpenMode.APPEND));
 100     iw.close();
 101     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 102     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
 103     ir.close();
 104   }
 105
 106   /**
 107    * Test timed sequence task.
 108    */
 109   public void testTimedSearchTask() throws Exception {
 110     String algLines[] = {
 111         "log.step=100000",
 112         "ResetSystemErase",
 113         "CreateIndex",
 114         "{ AddDoc } : 100",
 115         "ForceMerge(1)",
 116         "CloseIndex",
 117         "OpenReader",
 118         "{ CountingSearchTest } : .5s",
 119         "CloseReader",
 120     };
 121
 122     CountingSearchTestTask.numSearches = 0;
 123     execBenchmark(algLines);
 124     assertTrue(CountingSearchTestTask.numSearches > 0);
 125     long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
 126     assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
 127   }
 128
 129   // disabled until we fix BG thread prio -- this test
 130   // causes build to hang
 131   public void testBGSearchTaskThreads() throws Exception {
 132     String algLines[] = {
 133         "log.time.step.msec = 100",
 134         "log.step=100000",
 135         "ResetSystemErase",
 136         "CreateIndex",
 137         "{ AddDoc } : 1000",
 138         "ForceMerge(1)",
 139         "CloseIndex",
 140         "OpenReader",
 141         "{",
 142         "  [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
 143         "  Wait(0.5)",
 144         "}",
 145         "CloseReader",
 146         "RepSumByPref X"
 147     };
 148
 149     CountingSearchTestTask.numSearches = 0;
 150     execBenchmark(algLines);
 151     assertTrue(CountingSearchTestTask.numSearches > 0);
 152   }
 153
 154   public void testHighlighting() throws Exception {
 155     // 1. alg definition (required in every "logic" test)
 156     String algLines[] = {
 157         "doc.stored=true",
 158         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 159         "docs.file=" + getReuters20LinesFile(),
 160         "query.maker=" + ReutersQueryMaker.class.getName(),
 161         "ResetSystemErase",
 162         "CreateIndex",
 163         "{ AddDoc } : 100",
 164         "ForceMerge(1)",
 165         "CloseIndex",
 166         "OpenReader(true)",
 167         "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
 168         "CloseReader",
 169     };
 170
 171     // 2. we test this value later
 172     CountingHighlighterTestTask.numHighlightedResults = 0;
 173     CountingHighlighterTestTask.numDocsRetrieved = 0;
 174     // 3. execute the algorithm  (required in every "logic" test)
 175     Benchmark benchmark = execBenchmark(algLines);
 176
 177     // 4. test specific checks after the benchmark run completed.
 178     assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
 179     //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
 180     //we probably should use a different doc/query maker, but...
 181     assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
 182
 183     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
 184     // now we should be able to open the index for write.
 185     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
 186     iw.close();
 187     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 188     assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
 189     ir.close();
 190   }
 191
 192   public void testHighlightingTV() throws Exception {
 193     // 1. alg definition (required in every "logic" test)
 194     String algLines[] = {
 195         "doc.stored=true",//doc storage is required in order to have text to highlight
 196         "doc.term.vector.offsets=true",
 197         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 198         "docs.file=" + getReuters20LinesFile(),
 199         "query.maker=" + ReutersQueryMaker.class.getName(),
 200         "ResetSystemErase",
 201         "CreateIndex",
 202         "{ AddDoc } : 1000",
 203         "ForceMerge(1)",
 204         "CloseIndex",
 205         "OpenReader(false)",
 206         "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
 207         "CloseReader",
 208     };
 209
 210     // 2. we test this value later
 211     CountingHighlighterTestTask.numHighlightedResults = 0;
 212     CountingHighlighterTestTask.numDocsRetrieved = 0;
 213     // 3. execute the algorithm  (required in every "logic" test)
 214     Benchmark benchmark = execBenchmark(algLines);
 215
 216     // 4. test specific checks after the benchmark run completed.
 217     assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
 218     //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
 219     //we probably should use a different doc/query maker, but...
 220     assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
 221
 222     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
 223     // now we should be able to open the index for write.
 224     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
 225     iw.close();
 226     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 227     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
 228     ir.close();
 229   }
 230
 231   public void testHighlightingNoTvNoStore() throws Exception {
 232     // 1. alg definition (required in every "logic" test)
 233     String algLines[] = {
 234         "doc.stored=false",
 235         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 236         "docs.file=" + getReuters20LinesFile(),
 237         "query.maker=" + ReutersQueryMaker.class.getName(),
 238         "ResetSystemErase",
 239         "CreateIndex",
 240         "{ AddDoc } : 1000",
 241         "ForceMerge(1)",
 242         "CloseIndex",
 243         "OpenReader",
 244         "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
 245         "CloseReader",
 246     };
 247
 248     // 2. we test this value later
 249     CountingHighlighterTestTask.numHighlightedResults = 0;
 250     CountingHighlighterTestTask.numDocsRetrieved = 0;
 251     // 3. execute the algorithm  (required in every "logic" test)
 252     try {
 253       Benchmark benchmark = execBenchmark(algLines);
 254       assertTrue("CountingHighlighterTest should have thrown an exception", false);
 255       assertNotNull(benchmark); // (avoid compile warning on unused variable)
 256     } catch (Exception e) {
 257       assertTrue(true);
 258     }
 259   }
 260
 261   /**
 262    * Test Exhasting Doc Maker logic
 263    */
 264   public void testExhaustContentSource() throws Exception {
 265     // 1. alg definition (required in every "logic" test)
 266     String algLines[] = {
 267         "# ----- properties ",
 268         "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
 269         "content.source.log.step=1",
 270         "doc.term.vector=false",
 271         "content.source.forever=false",
 272         "directory=RAMDirectory",
 273         "doc.stored=false",
 274         "doc.tokenized=false",
 275         "# ----- alg ",
 276         "CreateIndex",
 277         "{ AddDoc } : * ",
 278         "ForceMerge(1)",
 279         "CloseIndex",
 280         "OpenReader",
 281         "{ CountingSearchTest } : 100",
 282         "CloseReader",
 283         "[ CountingSearchTest > : 30",
 284         "[ CountingSearchTest > : 9",
 285     };
 286
 287     // 2. we test this value later
 288     CountingSearchTestTask.numSearches = 0;
 289
 290     // 3. execute the algorithm  (required in every "logic" test)
 291     Benchmark benchmark = execBenchmark(algLines);
 292
 293     // 4. test specific checks after the benchmark run completed.
 294     assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
 295     assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
 296     // now we should be able to open the index for write.
 297     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
 298     iw.close();
 299     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 300     assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
 301     ir.close();
 302   }
 303
 304   // LUCENE-1994: test thread safety of SortableSingleDocMaker
 305   public void testDocMakerThreadSafety() throws Exception {
 306     // 1. alg definition (required in every "logic" test)
 307     String algLines[] = {
 308         "# ----- properties ",
 309         "content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
 310         "doc.term.vector=false",
 311         "log.step.AddDoc=10000",
 312         "content.source.forever=true",
 313         "directory=RAMDirectory",
 314         "doc.reuse.fields=false",
 315         "doc.stored=false",
 316         "doc.tokenized=false",
 317         "doc.index.props=true",
 318         "# ----- alg ",
 319         "CreateIndex",
 320         "[ { AddDoc > : 250 ] : 4",
 321         "CloseIndex",
 322     };
 323
 324     // 2. we test this value later
 325     CountingSearchTestTask.numSearches = 0;
 326
 327     // 3. execute the algorithm  (required in every "logic" test)
 328     Benchmark benchmark = execBenchmark(algLines);
 329
 330     IndexReader r = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 331     StringIndex idx = FieldCache.DEFAULT.getStringIndex(r, "country");
 332     final int maxDoc = r.maxDoc();
 333     assertEquals(1000, maxDoc);
 334     for(int i=0;i<1000;i++) {
 335       assertNotNull("doc " + i + " has null country", idx.lookup[idx.order[i]]);
 336     }
 337     r.close();
 338   }
 339
 340   /**
 341    * Test Parallel Doc Maker logic (for LUCENE-940)
 342    */
 343   public void testParallelDocMaker() throws Exception {
 344     // 1. alg definition (required in every "logic" test)
 345     String algLines[] = {
 346         "# ----- properties ",
 347         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 348         "docs.file=" + getReuters20LinesFile(),
 349         "content.source.log.step=3",
 350         "doc.term.vector=false",
 351         "content.source.forever=false",
 352         "directory=FSDirectory",
 353         "doc.stored=false",
 354         "doc.tokenized=false",
 355         "# ----- alg ",
 356         "CreateIndex",
 357         "[ { AddDoc } : * ] : 4 ",
 358         "CloseIndex",
 359     };
 360
 361     // 2. execute the algorithm  (required in every "logic" test)
 362     Benchmark benchmark = execBenchmark(algLines);
 363
 364     // 3. test number of docs in the index
 365     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 366     int ndocsExpected = 20; // first 20 reuters docs.
 367     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 368     ir.close();
 369   }
 370
 371   /**
 372    * Test WriteLineDoc and LineDocSource.
 373    */
 374   public void testLineDocFile() throws Exception {
 375     File lineFile = new File(TEMP_DIR, "test.reuters.lines.txt");
 376
 377     // We will call WriteLineDocs this many times
 378     final int NUM_TRY_DOCS = 50;
 379
 380     // Creates a line file with first 50 docs from SingleDocSource
 381     String algLines1[] = {
 382       "# ----- properties ",
 383       "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
 384       "content.source.forever=true",
 385       "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
 386       "# ----- alg ",
 387       "{WriteLineDoc()}:" + NUM_TRY_DOCS,
 388     };
 389
 390     // Run algo
 391     Benchmark benchmark = execBenchmark(algLines1);
 392
 393     BufferedReader r = new BufferedReader(new FileReader(lineFile));
 394     int numLines = 0;
 395     String line;
 396     while((line = r.readLine()) != null) {
 397       if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
 398         continue; // do not count the header line as a doc
 399       }
 400       numLines++;
 401     }
 402     r.close();
 403     assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
 404
 405     // Index the line docs
 406     String algLines2[] = {
 407       "# ----- properties ",
 408       "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
 409       "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 410       "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
 411       "content.source.forever=false",
 412       "doc.reuse.fields=false",
 413       "ram.flush.mb=4",
 414       "# ----- alg ",
 415       "ResetSystemErase",
 416       "CreateIndex",
 417       "{AddDoc}: *",
 418       "CloseIndex",
 419     };
 420
 421     // Run algo
 422     benchmark = execBenchmark(algLines2);
 423
 424     // now we should be able to open the index for write.
 425     IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
 426         new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
 427             .setOpenMode(OpenMode.APPEND));
 428     iw.close();
 429
 430     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 431     assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
 432     ir.close();
 433
 434     lineFile.delete();
 435   }
 436
 437   /**
 438    * Test ReadTokensTask
 439    */
 440   public void testReadTokens() throws Exception {
 441
 442     // We will call ReadTokens on this many docs
 443     final int NUM_DOCS = 20;
 444
 445     // Read tokens from first NUM_DOCS docs from Reuters and
 446     // then build index from the same docs
 447     String algLines1[] = {
 448       "# ----- properties ",
 449       "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
 450       "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 451       "docs.file=" + getReuters20LinesFile(),
 452       "# ----- alg ",
 453       "{ReadTokens}: " + NUM_DOCS,
 454       "ResetSystemErase",
 455       "CreateIndex",
 456       "{AddDoc}: " + NUM_DOCS,
 457       "CloseIndex",
 458     };
 459
 460     // Run algo
 461     Benchmark benchmark = execBenchmark(algLines1);
 462
 463     List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
 464
 465     // Count how many tokens all ReadTokens saw
 466     int totalTokenCount1 = 0;
 467     for (final TaskStats stat : stats) {
 468       if (stat.getTask().getName().equals("ReadTokens")) {
 469         totalTokenCount1 += stat.getCount();
 470       }
 471     }
 472
 473     // Separately count how many tokens are actually in the index:
 474     IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 475     assertEquals(NUM_DOCS, reader.numDocs());
 476
 477     TermEnum terms = reader.terms();
 478     TermDocs termDocs = reader.termDocs();
 479     int totalTokenCount2 = 0;
 480     while(terms.next()) {
 481       Term term = terms.term();
 482       /* not-tokenized, but indexed field */
 483       if (term != null && term.field() != DocMaker.ID_FIELD && term.field() != DocMaker.DATE_MSEC_FIELD && term.field() != DocMaker.TIME_SEC_FIELD) {
 484           termDocs.seek(terms.term());
 485         while (termDocs.next())
 486           totalTokenCount2 += termDocs.freq();
 487       }
 488     }
 489     reader.close();
 490
 491     // Make sure they are the same
 492     assertEquals(totalTokenCount1, totalTokenCount2);
 493   }
 494
 495   /**
 496    * Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941)
 497    */
 498   public void testParallelExhausted() throws Exception {
 499     // 1. alg definition (required in every "logic" test)
 500     String algLines[] = {
 501         "# ----- properties ",
 502         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 503         "docs.file=" + getReuters20LinesFile(),
 504         "content.source.log.step=3",
 505         "doc.term.vector=false",
 506         "content.source.forever=false",
 507         "directory=RAMDirectory",
 508         "doc.stored=false",
 509         "doc.tokenized=false",
 510         "task.max.depth.log=1",
 511         "# ----- alg ",
 512         "CreateIndex",
 513         "{ [ AddDoc]: 4} : * ",
 514         "ResetInputs ",
 515         "{ [ AddDoc]: 4} : * ",
 516         "WaitForMerges",
 517         "CloseIndex",
 518     };
 519
 520     // 2. execute the algorithm  (required in every "logic" test)
 521     Benchmark benchmark = execBenchmark(algLines);
 522
 523     // 3. test number of docs in the index
 524     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 525     int ndocsExpected = 2 * 20; // first 20 reuters docs.
 526     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 527     ir.close();
 528   }
 529
 530
 531   /**
 532    * Test that exhaust in loop works as expected (LUCENE-1115).
 533    */
 534   public void testExhaustedLooped() throws Exception {
 535     // 1. alg definition (required in every "logic" test)
 536     String algLines[] = {
 537         "# ----- properties ",
 538         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 539         "docs.file=" + getReuters20LinesFile(),
 540         "content.source.log.step=3",
 541         "doc.term.vector=false",
 542         "content.source.forever=false",
 543         "directory=RAMDirectory",
 544         "doc.stored=false",
 545         "doc.tokenized=false",
 546         "task.max.depth.log=1",
 547         "# ----- alg ",
 548         "{ \"Rounds\"",
 549         "  ResetSystemErase",
 550         "  CreateIndex",
 551         "  { \"AddDocs\"  AddDoc > : * ",
 552         "  WaitForMerges",
 553         "  CloseIndex",
 554         "} : 2",
 555     };
 556
 557     // 2. execute the algorithm  (required in every "logic" test)
 558     Benchmark benchmark = execBenchmark(algLines);
 559
 560     // 3. test number of docs in the index
 561     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 562     int ndocsExpected = 20;  // first 20 reuters docs.
 563     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 564     ir.close();
 565   }
 566
 567   /**
 568    * Test that we can close IndexWriter with argument "false".
 569    */
 570   public void testCloseIndexFalse() throws Exception {
 571     // 1. alg definition (required in every "logic" test)
 572     String algLines[] = {
 573         "# ----- properties ",
 574         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 575         "docs.file=" + getReuters20LinesFile(),
 576         "ram.flush.mb=-1",
 577         "max.buffered=2",
 578         "content.source.log.step=3",
 579         "doc.term.vector=false",
 580         "content.source.forever=false",
 581         "directory=RAMDirectory",
 582         "doc.stored=false",
 583         "doc.tokenized=false",
 584         "debug.level=1",
 585         "# ----- alg ",
 586         "{ \"Rounds\"",
 587         "  ResetSystemErase",
 588         "  CreateIndex",
 589         "  { \"AddDocs\"  AddDoc > : * ",
 590         "  CloseIndex(false)",
 591         "} : 2",
 592     };
 593
 594     // 2. execute the algorithm  (required in every "logic" test)
 595     Benchmark benchmark = execBenchmark(algLines);
 596
 597     // 3. test number of docs in the index
 598     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 599     int ndocsExpected = 20; // first 20 reuters docs.
 600     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 601     ir.close();
 602   }
 603
 604   public static class MyMergeScheduler extends SerialMergeScheduler {
 605     boolean called;
 606     public MyMergeScheduler() {
 607       super();
 608       called = true;
 609     }
 610   }
 611
 612   public void testDeleteByPercent() throws Exception {
 613     // 1. alg definition (required in every "logic" test)
 614     String algLines[] = {
 615         "# ----- properties ",
 616         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 617         "docs.file=" + getReuters20LinesFile(),
 618         "ram.flush.mb=-1",
 619         "max.buffered=2",
 620         "content.source.log.step=3",
 621         "doc.term.vector=false",
 622         "content.source.forever=false",
 623         "directory=RAMDirectory",
 624         "doc.stored=false",
 625         "doc.tokenized=false",
 626         "debug.level=1",
 627         "# ----- alg ",
 628         "CreateIndex",
 629         "{ \"AddDocs\"  AddDoc > : * ",
 630         "CloseIndex()",
 631         "OpenReader(false)",
 632         "DeleteByPercent(20)",
 633         "CloseReader"
 634     };
 635
 636     // 2. execute the algorithm  (required in every "logic" test)
 637     Benchmark benchmark = execBenchmark(algLines);
 638
 639     // 3. test number of docs in the index
 640     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 641     int ndocsExpected = 16; // first 20 reuters docs, minus 20%
 642     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 643     ir.close();
 644   }
 645
 646   /**
 647    * Test that we can set merge scheduler".
 648    */
 649   public void testMergeScheduler() throws Exception {
 650     // 1. alg definition (required in every "logic" test)
 651     String algLines[] = {
 652         "# ----- properties ",
 653         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 654         "docs.file=" + getReuters20LinesFile(),
 655         "content.source.log.step=3",
 656         "doc.term.vector=false",
 657         "content.source.forever=false",
 658         "directory=RAMDirectory",
 659         "merge.scheduler=" + MyMergeScheduler.class.getName(),
 660         "doc.stored=false",
 661         "doc.tokenized=false",
 662         "debug.level=1",
 663         "# ----- alg ",
 664         "{ \"Rounds\"",
 665         "  ResetSystemErase",
 666         "  CreateIndex",
 667         "  { \"AddDocs\"  AddDoc > : * ",
 668         "} : 2",
 669     };
 670     // 2. execute the algorithm  (required in every "logic" test)
 671     Benchmark benchmark = execBenchmark(algLines);
 672
 673     assertTrue("did not use the specified MergeScheduler",
 674         ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig()
 675             .getMergeScheduler()).called);
 676     benchmark.getRunData().getIndexWriter().close();
 677
 678     // 3. test number of docs in the index
 679     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 680     int ndocsExpected = 20; // first 20 reuters docs.
 681     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 682     ir.close();
 683   }
 684
 685   public static class MyMergePolicy extends LogDocMergePolicy {
 686     boolean called;
 687     public MyMergePolicy() {
 688       called = true;
 689     }
 690   }
 691
 692   /**
 693    * Test that we can set merge policy".
 694    */
 695   public void testMergePolicy() throws Exception {
 696     // 1. alg definition (required in every "logic" test)
 697     String algLines[] = {
 698         "# ----- properties ",
 699         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 700         "docs.file=" + getReuters20LinesFile(),
 701         "content.source.log.step=3",
 702         "ram.flush.mb=-1",
 703         "max.buffered=2",
 704         "doc.term.vector=false",
 705         "content.source.forever=false",
 706         "directory=RAMDirectory",
 707         "merge.policy=" + MyMergePolicy.class.getName(),
 708         "doc.stored=false",
 709         "doc.tokenized=false",
 710         "debug.level=1",
 711         "# ----- alg ",
 712         "{ \"Rounds\"",
 713         "  ResetSystemErase",
 714         "  CreateIndex",
 715         "  { \"AddDocs\"  AddDoc > : * ",
 716         "} : 2",
 717     };
 718
 719     // 2. execute the algorithm  (required in every "logic" test)
 720     Benchmark benchmark = execBenchmark(algLines);
 721     assertTrue("did not use the specified MergePolicy", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy()).called);
 722     benchmark.getRunData().getIndexWriter().close();
 723
 724     // 3. test number of docs in the index
 725     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 726     int ndocsExpected = 20; // first 20 reuters docs.
 727     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 728     ir.close();
 729   }
 730
 731   /**
 732    * Test that IndexWriter settings stick.
 733    */
 734   public void testIndexWriterSettings() throws Exception {
 735     // 1. alg definition (required in every "logic" test)
 736     String algLines[] = {
 737         "# ----- properties ",
 738         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 739         "docs.file=" + getReuters20LinesFile(),
 740         "content.source.log.step=3",
 741         "ram.flush.mb=-1",
 742         "max.buffered=2",
 743         "compound=cmpnd:true:false",
 744         "doc.term.vector=vector:false:true",
 745         "content.source.forever=false",
 746         "directory=RAMDirectory",
 747         "doc.stored=false",
 748         "merge.factor=3",
 749         "doc.tokenized=false",
 750         "debug.level=1",
 751         "# ----- alg ",
 752         "{ \"Rounds\"",
 753         "  ResetSystemErase",
 754         "  CreateIndex",
 755         "  { \"AddDocs\"  AddDoc > : * ",
 756         "  NewRound",
 757         "} : 2",
 758     };
 759
 760     // 2. execute the algorithm  (required in every "logic" test)
 761     Benchmark benchmark = execBenchmark(algLines);
 762     final IndexWriter writer = benchmark.getRunData().getIndexWriter();
 763     assertEquals(2, writer.getConfig().getMaxBufferedDocs());
 764     assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
 765     assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
 766     assertFalse(((LogMergePolicy) writer.getConfig().getMergePolicy()).getUseCompoundFile());
 767     writer.close();
 768     Directory dir = benchmark.getRunData().getDirectory();
 769     IndexReader reader = IndexReader.open(dir, true);
 770     TermFreqVector [] tfv = reader.getTermFreqVectors(0);
 771     assertNotNull(tfv);
 772     assertTrue(tfv.length > 0);
 773     reader.close();
 774   }
 775
 776   /**
 777    * Test indexing with facets tasks.
 778    */
 779   public void testIndexingWithFacets() throws Exception {
 780     // 1. alg definition (required in every "logic" test)
 781     String algLines[] = {
 782         "# ----- properties ",
 783         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 784         "docs.file=" + getReuters20LinesFile(),
 785         "content.source.log.step=100",
 786         "content.source.forever=false",
 787         "directory=RAMDirectory",
 788         "doc.stored=false",
 789         "merge.factor=3",
 790         "doc.tokenized=false",
 791         "debug.level=1",
 792         "# ----- alg ",
 793         "ResetSystemErase",
 794         "CreateIndex",
 795         "CreateTaxonomyIndex",
 796         "{ \"AddDocs\"  AddFacetedDoc > : * ",
 797         "CloseIndex",
 798         "CloseTaxonomyIndex",
 799         "OpenTaxonomyReader",
 800     };
 801
 802     // 2. execute the algorithm  (required in every "logic" test)
 803     Benchmark benchmark = execBenchmark(algLines);
 804     PerfRunData runData = benchmark.getRunData();
 805     assertNull("taxo writer was not properly closed",runData.getTaxonomyWriter());
 806     TaxonomyReader taxoReader = runData.getTaxonomyReader();
 807     assertNotNull("taxo reader was not opened", taxoReader);
 808     assertTrue("nothing was added to the taxnomy (expecting root and at least one addtional category)",taxoReader.getSize()>1);
 809     taxoReader.close();
 810   }
 811
 812   /**
 813    * Test that we can call forceMerge(maxNumSegments).
 814    */
 815   public void testForceMerge() throws Exception {
 816     // 1. alg definition (required in every "logic" test)
 817     String algLines[] = {
 818         "# ----- properties ",
 819         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 820         "docs.file=" + getReuters20LinesFile(),
 821         "content.source.log.step=3",
 822         "ram.flush.mb=-1",
 823         "max.buffered=3",
 824         "doc.term.vector=false",
 825         "content.source.forever=false",
 826         "directory=RAMDirectory",
 827         "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
 828         "doc.stored=false",
 829         "doc.tokenized=false",
 830         "debug.level=1",
 831         "# ----- alg ",
 832         "{ \"Rounds\"",
 833         "  ResetSystemErase",
 834         "  CreateIndex",
 835         "  { \"AddDocs\"  AddDoc > : * ",
 836         "  ForceMerge(3)",
 837         "  CloseIndex()",
 838         "} : 2",
 839     };
 840
 841     // 2. execute the algorithm  (required in every "logic" test)
 842     Benchmark benchmark = execBenchmark(algLines);
 843
 844     // 3. test number of docs in the index
 845     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
 846     int ndocsExpected = 20; // first 20 reuters docs.
 847     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
 848     ir.close();
 849
 850     // Make sure we have 3 segments:
 851     SegmentInfos infos = new SegmentInfos();
 852     infos.read(benchmark.getRunData().getDirectory());
 853     assertEquals(3, infos.size());
 854   }
 855
 856   /**
 857    * Test disabling task count (LUCENE-1136).
 858    */
 859   public void testDisableCounting() throws Exception {
 860     doTestDisableCounting(true);
 861     doTestDisableCounting(false);
 862   }
 863
 864   private void doTestDisableCounting(boolean disable) throws Exception {
 865     // 1. alg definition (required in every "logic" test)
 866     String algLines[] = disableCountingLines(disable);
 867
 868     // 2. execute the algorithm  (required in every "logic" test)
 869     Benchmark benchmark = execBenchmark(algLines);
 870
 871     // 3. test counters
 872     int n = disable ? 0 : 1;
 873     int nChecked = 0;
 874     for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
 875       String taskName = stats.getTask().getName();
 876       if (taskName.equals("Rounds")) {
 877         assertEquals("Wrong total count!",20+2*n,stats.getCount());
 878         nChecked++;
 879       } else if (taskName.equals("CreateIndex")) {
 880         assertEquals("Wrong count for CreateIndex!",n,stats.getCount());
 881         nChecked++;
 882       } else if (taskName.equals("CloseIndex")) {
 883         assertEquals("Wrong count for CloseIndex!",n,stats.getCount());
 884         nChecked++;
 885       }
 886     }
 887     assertEquals("Missing some tasks to check!",3,nChecked);
 888   }
 889
 890   private String[] disableCountingLines (boolean disable) {
 891     String dis = disable ? "-" : "";
 892     return new String[] {
 893         "# ----- properties ",
 894         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 895         "docs.file=" + getReuters20LinesFile(),
 896         "content.source.log.step=30",
 897         "doc.term.vector=false",
 898         "content.source.forever=false",
 899         "directory=RAMDirectory",
 900         "doc.stored=false",
 901         "doc.tokenized=false",
 902         "task.max.depth.log=1",
 903         "# ----- alg ",
 904         "{ \"Rounds\"",
 905         "  ResetSystemErase",
 906         "  "+dis+"CreateIndex",            // optionally disable counting here
 907         "  { \"AddDocs\"  AddDoc > : * ",
 908         "  "+dis+"  CloseIndex",             // optionally disable counting here (with extra blanks)
 909         "}",
 910         "RepSumByName",
 911     };
 912   }
 913
 914   /**
 915    * Test that we can change the Locale in the runData,
 916    * that it is parsed as we expect.
 917    */
 918   public void testLocale() throws Exception {
 919     // empty Locale: clear it (null)
 920     Benchmark benchmark = execBenchmark(getLocaleConfig(""));
 921     assertNull(benchmark.getRunData().getLocale());
 922
 923     // ROOT locale
 924     benchmark = execBenchmark(getLocaleConfig("ROOT"));
 925     assertEquals(new Locale(""), benchmark.getRunData().getLocale());
 926
 927     // specify just a language
 928     benchmark = execBenchmark(getLocaleConfig("de"));
 929     assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
 930
 931     // specify language + country
 932     benchmark = execBenchmark(getLocaleConfig("en,US"));
 933     assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
 934
 935     // specify language + country + variant
 936     benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
 937     assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
 938   }
 939
 940   private String[] getLocaleConfig(String localeParam) {
 941     String algLines[] = {
 942         "# ----- properties ",
 943         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
 944         "docs.file=" + getReuters20LinesFile(),
 945         "content.source.log.step=3",
 946         "content.source.forever=false",
 947         "directory=RAMDirectory",
 948         "# ----- alg ",
 949         "{ \"Rounds\"",
 950         "  ResetSystemErase",
 951         "  NewLocale(" + localeParam + ")",
 952         "  CreateIndex",
 953         "  { \"AddDocs\"  AddDoc > : * ",
 954         "  NewRound",
 955         "} : 1",
 956     };
 957     return algLines;
 958   }
 959
 960   /**
 961    * Test that we can create CollationAnalyzers.
 962    */
 963   public void testCollator() throws Exception {
 964     // ROOT locale
 965     Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
 966     CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
 967         .getInstance(new Locale("")));
 968     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 969
 970     // specify just a language
 971     benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
 972     expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
 973     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 974
 975     // specify language + country
 976     benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
 977     expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
 978         "US")));
 979     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 980
 981     // specify language + country + variant
 982     benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
 983     expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
 984         "NO", "NY")));
 985     assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
 986   }
 987
 988   private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
 989       throws Exception {
 990     TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
 991     TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
 992     ts1.reset();
 993     ts2.reset();
 994     CharTermAttribute termAtt1 = ts1.addAttribute(CharTermAttribute.class);
 995     CharTermAttribute termAtt2 = ts2.addAttribute(CharTermAttribute.class);
 996     assertTrue(ts1.incrementToken());
 997     assertTrue(ts2.incrementToken());
 998     assertEquals(termAtt1.toString(), termAtt2.toString());
 999     assertFalse(ts1.incrementToken());
1000     assertFalse(ts2.incrementToken());
1001     ts1.close();
1002     ts2.close();
1003   }
1004
1005   private String[] getCollatorConfig(String localeParam,
1006       String collationParam) {
1007     String algLines[] = {
1008         "# ----- properties ",
1009         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
1010         "docs.file=" + getReuters20LinesFile(),
1011         "content.source.log.step=3",
1012         "content.source.forever=false",
1013         "directory=RAMDirectory",
1014         "# ----- alg ",
1015         "{ \"Rounds\"",
1016         "  ResetSystemErase",
1017         "  NewLocale(" + localeParam + ")",
1018         "  NewCollationAnalyzer(" + collationParam + ")",
1019         "  CreateIndex",
1020         "  { \"AddDocs\"  AddDoc > : * ",
1021         "  NewRound",
1022         "} : 1",
1023     };
1024     return algLines;
1025   }
1026
1027   /**
1028    * Test that we can create ShingleAnalyzerWrappers.
1029    */
1030   public void testShingleAnalyzer() throws Exception {
1031     String text = "one,two,three, four five six";
1032
1033     // Default analyzer, maxShingleSize, and outputUnigrams
1034     Benchmark benchmark = execBenchmark(getShingleConfig(""));
1035     benchmark.getRunData().getAnalyzer().tokenStream
1036       ("bogus", new StringReader(text)).close();
1037     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1038                        new String[] {"one", "one two", "two", "two three",
1039                                      "three", "three four", "four", "four five",
1040                                      "five", "five six", "six"});
1041     // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
1042     benchmark = execBenchmark
1043       (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
1044     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1045                        new String[] { "one two", "one two three", "two three",
1046                                       "two three four", "three four",
1047                                       "three four five", "four five",
1048                                       "four five six", "five six" });
1049     // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
1050     benchmark = execBenchmark
1051       (getShingleConfig("analyzer:WhitespaceAnalyzer"));
1052     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1053                        new String[] { "one,two,three,", "one,two,three, four",
1054                                       "four", "four five", "five", "five six",
1055                                       "six" });
1056
1057     // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
1058     benchmark = execBenchmark
1059       (getShingleConfig
1060         ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
1061     assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
1062                        new String[] { "one,two,three, four",
1063                                       "one,two,three, four five",
1064                                       "four five", "four five six",
1065                                       "five six" });
1066   }
1067
1068   private void assertEqualShingle
1069     (Analyzer analyzer, String text, String[] expected) throws Exception {
1070     BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
1071   }
1072
1073   private String[] getShingleConfig(String params) {
1074     String algLines[] = {
1075         "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
1076         "docs.file=" + getReuters20LinesFile(),
1077         "content.source.forever=false",
1078         "directory=RAMDirectory",
1079         "NewShingleAnalyzer(" + params + ")",
1080         "CreateIndex",
1081         "{ \"AddDocs\"  AddDoc > : * "
1082     };
1083     return algLines;
1084   }
1085
1086   private String getReuters20LinesFile() {
1087     return getWorkDirResourcePath("reuters.first20.lines.txt");
1088   }
1089 }