lucene-java-3.4.0/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java

   1 package org.apache.lucene.search.vectorhighlight;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22 import java.util.Collection;
  23
  24 import org.apache.lucene.analysis.Analyzer;
  25 import org.apache.lucene.analysis.MockAnalyzer;
  26 import org.apache.lucene.analysis.MockTokenizer;
  27 import org.apache.lucene.analysis.TokenStream;
  28 import org.apache.lucene.analysis.Tokenizer;
  29 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  30 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  31 import org.apache.lucene.document.Document;
  32 import org.apache.lucene.document.Field;
  33 import org.apache.lucene.document.Field.Index;
  34 import org.apache.lucene.document.Field.Store;
  35 import org.apache.lucene.document.Field.TermVector;
  36 import org.apache.lucene.index.IndexReader;
  37 import org.apache.lucene.index.IndexWriter;
  38 import org.apache.lucene.index.IndexWriterConfig;
  39 import org.apache.lucene.index.Term;
  40 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  41 import org.apache.lucene.queryParser.QueryParser;
  42 import org.apache.lucene.search.DisjunctionMaxQuery;
  43 import org.apache.lucene.search.PhraseQuery;
  44 import org.apache.lucene.search.Query;
  45 import org.apache.lucene.search.TermQuery;
  46 import org.apache.lucene.store.Directory;
  47 import org.apache.lucene.util.LuceneTestCase;
  48
  49 public abstract class AbstractTestCase extends LuceneTestCase {
  50
  51   protected final String F = "f";
  52   protected final String F1 = "f1";
  53   protected final String F2 = "f2";
  54   protected Directory dir;
  55   protected Analyzer analyzerW;
  56   protected Analyzer analyzerB;
  57   protected Analyzer analyzerK;
  58   protected IndexReader reader;
  59   protected QueryParser paW;
  60   protected QueryParser paB;
  61
  62   protected static final String[] shortMVValues = {
  63     "",
  64     "",
  65     "a b c",
  66     "",   // empty data in multi valued field
  67     "d e"
  68   };
  69
  70   protected static final String[] longMVValues = {
  71     "Followings are the examples of customizable parameters and actual examples of customization:",
  72     "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
  73   };
  74
  75   // test data for LUCENE-1448 bug
  76   protected static final String[] biMVValues = {
  77     "\nLucene/Solr does not require such additional hardware.",
  78     "\nWhen you talk about processing speed, the"
  79   };
  80
  81   protected static final String[] strMVValues = {
  82     "abc",
  83     "defg",
  84     "hijkl"
  85   };
  86
  87   @Override
  88   public void setUp() throws Exception {
  89     super.setUp();
  90     analyzerW = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
  91     analyzerB = new BigramAnalyzer();
  92     analyzerK = new MockAnalyzer(random, MockTokenizer.KEYWORD, false);
  93     paW = new QueryParser(TEST_VERSION_CURRENT,  F, analyzerW );
  94     paB = new QueryParser(TEST_VERSION_CURRENT,  F, analyzerB );
  95     dir = newDirectory();
  96   }
  97
  98   @Override
  99   public void tearDown() throws Exception {
 100     if( reader != null ){
 101       reader.close();
 102       reader = null;
 103     }
 104     dir.close();
 105     super.tearDown();
 106   }
 107
 108   protected Query tq( String text ){
 109     return tq( 1F, text );
 110   }
 111
 112   protected Query tq( float boost, String text ){
 113     return tq( boost, F, text );
 114   }
 115
 116   protected Query tq( String field, String text ){
 117     return tq( 1F, field, text );
 118   }
 119
 120   protected Query tq( float boost, String field, String text ){
 121     Query query = new TermQuery( new Term( field, text ) );
 122     query.setBoost( boost );
 123     return query;
 124   }
 125
 126   protected Query pqF( String... texts ){
 127     return pqF( 1F, texts );
 128   }
 129
 130   protected Query pqF( float boost, String... texts ){
 131     return pqF( boost, 0, texts );
 132   }
 133
 134   protected Query pqF( float boost, int slop, String... texts ){
 135     return pq( boost, slop, F, texts );
 136   }
 137
 138   protected Query pq( String field, String... texts ){
 139     return pq( 1F, 0, field, texts );
 140   }
 141
 142   protected Query pq( float boost, String field, String... texts ){
 143     return pq( boost, 0, field, texts );
 144   }
 145
 146   protected Query pq( float boost, int slop, String field, String... texts ){
 147     PhraseQuery query = new PhraseQuery();
 148     for( String text : texts ){
 149       query.add( new Term( field, text ) );
 150     }
 151     query.setBoost( boost );
 152     query.setSlop( slop );
 153     return query;
 154   }
 155
 156   protected Query dmq( Query... queries ){
 157     return dmq( 0.0F, queries );
 158   }
 159
 160   protected Query dmq( float tieBreakerMultiplier, Query... queries ){
 161     DisjunctionMaxQuery query = new DisjunctionMaxQuery( tieBreakerMultiplier );
 162     for( Query q : queries ){
 163       query.add( q );
 164     }
 165     return query;
 166   }
 167
 168   protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
 169     assertEquals( expected.length, actual.size() );
 170     for( Query query : expected ){
 171       assertTrue( actual.contains( query ) );
 172     }
 173   }
 174
 175   static final class BigramAnalyzer extends Analyzer {
 176     @Override
 177     public TokenStream tokenStream(String fieldName, Reader reader) {
 178       return new BasicNGramTokenizer( reader );
 179     }
 180   }
 181
 182   static final class BasicNGramTokenizer extends Tokenizer {
 183
 184     public static final int DEFAULT_N_SIZE = 2;
 185     public static final String DEFAULT_DELIMITERS = " \t\n.,";
 186     private final int n;
 187     private final String delimiters;
 188     private int startTerm;
 189     private int lenTerm;
 190     private int startOffset;
 191     private int nextStartOffset;
 192     private int ch;
 193     private String snippet;
 194     private StringBuilder snippetBuffer;
 195     private static final int BUFFER_SIZE = 4096;
 196     private char[] charBuffer;
 197     private int charBufferIndex;
 198     private int charBufferLen;
 199
 200     public BasicNGramTokenizer( Reader in ){
 201       this( in, DEFAULT_N_SIZE );
 202     }
 203
 204     public BasicNGramTokenizer( Reader in, int n ){
 205       this( in, n, DEFAULT_DELIMITERS );
 206     }
 207
 208     public BasicNGramTokenizer( Reader in, String delimiters ){
 209       this( in, DEFAULT_N_SIZE, delimiters );
 210     }
 211
 212     public BasicNGramTokenizer( Reader in, int n, String delimiters ){
 213       super(in);
 214       this.n = n;
 215       this.delimiters = delimiters;
 216       startTerm = 0;
 217       nextStartOffset = 0;
 218       snippet = null;
 219       snippetBuffer = new StringBuilder();
 220       charBuffer = new char[BUFFER_SIZE];
 221       charBufferIndex = BUFFER_SIZE;
 222       charBufferLen = 0;
 223       ch = 0;
 224     }
 225
 226     CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 227     OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 228     @Override
 229     public boolean incrementToken() throws IOException {
 230       if( !getNextPartialSnippet() )
 231         return false;
 232       clearAttributes();
 233       termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
 234       offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
 235       return true;
 236     }
 237
 238     private int getFinalOffset() {
 239       return nextStartOffset;
 240     }
 241
 242     @Override
 243     public final void end(){
 244       offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
 245     }
 246
 247     protected boolean getNextPartialSnippet() throws IOException {
 248       if( snippet != null && snippet.length() >= startTerm + 1 + n ){
 249         startTerm++;
 250         startOffset++;
 251         lenTerm = n;
 252         return true;
 253       }
 254       return getNextSnippet();
 255     }
 256
 257     protected boolean getNextSnippet() throws IOException {
 258       startTerm = 0;
 259       startOffset = nextStartOffset;
 260       snippetBuffer.delete( 0, snippetBuffer.length() );
 261       while( true ){
 262         if( ch != -1 )
 263           ch = readCharFromBuffer();
 264         if( ch == -1 ) break;
 265         else if( !isDelimiter( ch ) )
 266           snippetBuffer.append( (char)ch );
 267         else if( snippetBuffer.length() > 0 )
 268           break;
 269         else
 270           startOffset++;
 271       }
 272       if( snippetBuffer.length() == 0 )
 273         return false;
 274       snippet = snippetBuffer.toString();
 275       lenTerm = snippet.length() >= n ? n : snippet.length();
 276       return true;
 277     }
 278
 279     protected int readCharFromBuffer() throws IOException {
 280       if( charBufferIndex >= charBufferLen ){
 281         charBufferLen = input.read( charBuffer );
 282         if( charBufferLen == -1 ){
 283           return -1;
 284         }
 285         charBufferIndex = 0;
 286       }
 287       int c = charBuffer[charBufferIndex++];
 288       nextStartOffset++;
 289       return c;
 290     }
 291
 292     protected boolean isDelimiter( int c ){
 293       return delimiters.indexOf( c ) >= 0;
 294     }
 295
 296     @Override
 297     public void reset( Reader input ) throws IOException {
 298       super.reset( input );
 299       reset();
 300     }
 301
 302     @Override
 303     public void reset() throws IOException {
 304       startTerm = 0;
 305       nextStartOffset = 0;
 306       snippet = null;
 307       snippetBuffer.setLength( 0 );
 308       charBufferIndex = BUFFER_SIZE;
 309       charBufferLen = 0;
 310       ch = 0;
 311     }
 312   }
 313
 314   protected void make1d1fIndex( String value ) throws Exception {
 315     make1dmfIndex( value );
 316   }
 317
 318   protected void make1d1fIndexB( String value ) throws Exception {
 319     make1dmfIndexB( value );
 320   }
 321
 322   protected void make1dmfIndex( String... values ) throws Exception {
 323     make1dmfIndex( analyzerW, values );
 324   }
 325
 326   protected void make1dmfIndexB( String... values ) throws Exception {
 327     make1dmfIndex( analyzerB, values );
 328   }
 329
 330   // make 1 doc with multi valued field
 331   protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
 332     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
 333         TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
 334     Document doc = new Document();
 335     for( String value: values )
 336       doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
 337     writer.addDocument( doc );
 338     writer.close();
 339     if (reader != null) reader.close();
 340     reader = IndexReader.open( dir, true );
 341   }
 342
 343   // make 1 doc with multi valued & not analyzed field
 344   protected void make1dmfIndexNA( String... values ) throws Exception {
 345     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
 346         TEST_VERSION_CURRENT, analyzerK).setOpenMode(OpenMode.CREATE));
 347     Document doc = new Document();
 348     for( String value: values )
 349       doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
 350     writer.addDocument( doc );
 351     writer.close();
 352     if (reader != null) reader.close();
 353     reader = IndexReader.open( dir, true );
 354   }
 355
 356   protected void makeIndexShortMV() throws Exception {
 357
 358     //  0
 359     // ""
 360     //  1
 361     // ""
 362
 363     //  234567
 364     // "a b c"
 365     //  0 1 2
 366
 367     //  8
 368     // ""
 369
 370     //   111
 371     //  9012
 372     // "d e"
 373     //  3 4
 374     make1dmfIndex( shortMVValues );
 375   }
 376
 377   protected void makeIndexLongMV() throws Exception {
 378     //           11111111112222222222333333333344444444445555555555666666666677777777778888888888999
 379     // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
 380     // Followings are the examples of customizable parameters and actual examples of customization:
 381     // 0          1   2   3        4  5            6          7   8      9        10 11
 382
 383     //        1                                                                                                   2
 384     // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
 385     // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
 386     // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
 387     // 12  13  (14)   (15)     16  17   18  19 20    21       22   23 (24)   (25)     26   27   28   29  30  31  32   33      34
 388
 389     make1dmfIndex( longMVValues );
 390   }
 391
 392   protected void makeIndexLongMVB() throws Exception {
 393     // "*" ... LF
 394
 395     //           1111111111222222222233333333334444444444555555
 396     // 01234567890123456789012345678901234567890123456789012345
 397     // *Lucene/Solr does not require such additional hardware.
 398     //  Lu 0        do 10    re 15   su 21       na 31
 399     //   uc 1        oe 11    eq 16   uc 22       al 32
 400     //    ce 2        es 12    qu 17   ch 23         ha 33
 401     //     en 3          no 13  ui 18     ad 24       ar 34
 402     //      ne 4          ot 14  ir 19     dd 25       rd 35
 403     //       e/ 5                 re 20     di 26       dw 36
 404     //        /S 6                           it 27       wa 37
 405     //         So 7                           ti 28       ar 38
 406     //          ol 8                           io 29       re 39
 407     //           lr 9                           on 30
 408
 409     // 5555666666666677777777778888888888999999999
 410     // 6789012345678901234567890123456789012345678
 411     // *When you talk about processing speed, the
 412     //  Wh 40         ab 48     es 56         th 65
 413     //   he 41         bo 49     ss 57         he 66
 414     //    en 42         ou 50     si 58
 415     //       yo 43       ut 51     in 59
 416     //        ou 44         pr 52   ng 60
 417     //           ta 45       ro 53     sp 61
 418     //            al 46       oc 54     pe 62
 419     //             lk 47       ce 55     ee 63
 420     //                                    ed 64
 421
 422     make1dmfIndexB( biMVValues );
 423   }
 424
 425   protected void makeIndexStrMV() throws Exception {
 426
 427     //  0123
 428     // "abc"
 429
 430     //  34567
 431     // "defg"
 432
 433     //     111
 434     //  789012
 435     // "hijkl"
 436     make1dmfIndexNA( strMVValues );
 437   }
 438 }