pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / highlighter / src / test / org / apache / lucene / search / vectorhighlight / AbstractTestCase.java
diff --git a/lucene-java-3.5.0/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/lucene-java-3.5.0/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java

new file mode 100644 (file)

index 0000000..0f19ebf
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
@@ -0,0 +1,438 @@
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public abstract class AbstractTestCase extends LuceneTestCase {
+
+  protected final String F = "f";
+  protected final String F1 = "f1";
+  protected final String F2 = "f2";
+  protected Directory dir;
+  protected Analyzer analyzerW;
+  protected Analyzer analyzerB;
+  protected Analyzer analyzerK;
+  protected IndexReader reader;  
+  protected QueryParser paW;
+  protected QueryParser paB;
+  
+  protected static final String[] shortMVValues = {
+    "",
+    "",
+    "a b c",
+    "",   // empty data in multi valued field
+    "d e"
+  };
+  
+  protected static final String[] longMVValues = {
+    "Followings are the examples of customizable parameters and actual examples of customization:",
+    "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
+  };
+  
+  // test data for LUCENE-1448 bug
+  protected static final String[] biMVValues = {
+    "\nLucene/Solr does not require such additional hardware.",
+    "\nWhen you talk about processing speed, the"
+  };
+  
+  protected static final String[] strMVValues = {
+    "abc",
+    "defg",
+    "hijkl"
+  };
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    analyzerW = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
+    analyzerB = new BigramAnalyzer();
+    analyzerK = new MockAnalyzer(random, MockTokenizer.KEYWORD, false);
+    paW = new QueryParser(TEST_VERSION_CURRENT,  F, analyzerW );
+    paB = new QueryParser(TEST_VERSION_CURRENT,  F, analyzerB );
+    dir = newDirectory();
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    if( reader != null ){
+      reader.close();
+      reader = null;
+    }
+    dir.close();
+    super.tearDown();
+  }
+
+  protected Query tq( String text ){
+    return tq( 1F, text );
+  }
+
+  protected Query tq( float boost, String text ){
+    return tq( boost, F, text );
+  }
+  
+  protected Query tq( String field, String text ){
+    return tq( 1F, field, text );
+  }
+  
+  protected Query tq( float boost, String field, String text ){
+    Query query = new TermQuery( new Term( field, text ) );
+    query.setBoost( boost );
+    return query;
+  }
+  
+  protected Query pqF( String... texts ){
+    return pqF( 1F, texts );
+  }
+  
+  protected Query pqF( float boost, String... texts ){
+    return pqF( boost, 0, texts );
+  }
+  
+  protected Query pqF( float boost, int slop, String... texts ){
+    return pq( boost, slop, F, texts );
+  }
+  
+  protected Query pq( String field, String... texts ){
+    return pq( 1F, 0, field, texts );
+  }
+  
+  protected Query pq( float boost, String field, String... texts ){
+    return pq( boost, 0, field, texts );
+  }
+  
+  protected Query pq( float boost, int slop, String field, String... texts ){
+    PhraseQuery query = new PhraseQuery();
+    for( String text : texts ){
+      query.add( new Term( field, text ) );
+    }
+    query.setBoost( boost );
+    query.setSlop( slop );
+    return query;
+  }
+  
+  protected Query dmq( Query... queries ){
+    return dmq( 0.0F, queries );
+  }
+  
+  protected Query dmq( float tieBreakerMultiplier, Query... queries ){
+    DisjunctionMaxQuery query = new DisjunctionMaxQuery( tieBreakerMultiplier );
+    for( Query q : queries ){
+      query.add( q );
+    }
+    return query;
+  }
+  
+  protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
+    assertEquals( expected.length, actual.size() );
+    for( Query query : expected ){
+      assertTrue( actual.contains( query ) );
+    }
+  }
+
+  static final class BigramAnalyzer extends Analyzer {
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new BasicNGramTokenizer( reader );
+    }
+  }
+  
+  static final class BasicNGramTokenizer extends Tokenizer {
+
+    public static final int DEFAULT_N_SIZE = 2;
+    public static final String DEFAULT_DELIMITERS = " \t\n.,";
+    private final int n;
+    private final String delimiters;
+    private int startTerm;
+    private int lenTerm;
+    private int startOffset;
+    private int nextStartOffset;
+    private int ch;
+    private String snippet;
+    private StringBuilder snippetBuffer;
+    private static final int BUFFER_SIZE = 4096;
+    private char[] charBuffer;
+    private int charBufferIndex;
+    private int charBufferLen;
+    
+    public BasicNGramTokenizer( Reader in ){
+      this( in, DEFAULT_N_SIZE );
+    }
+    
+    public BasicNGramTokenizer( Reader in, int n ){
+      this( in, n, DEFAULT_DELIMITERS );
+    }
+    
+    public BasicNGramTokenizer( Reader in, String delimiters ){
+      this( in, DEFAULT_N_SIZE, delimiters );
+    }
+    
+    public BasicNGramTokenizer( Reader in, int n, String delimiters ){
+      super(in);
+      this.n = n;
+      this.delimiters = delimiters;
+      startTerm = 0;
+      nextStartOffset = 0;
+      snippet = null;
+      snippetBuffer = new StringBuilder();
+      charBuffer = new char[BUFFER_SIZE];
+      charBufferIndex = BUFFER_SIZE;
+      charBufferLen = 0;
+      ch = 0;
+    }
+
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    @Override
+    public boolean incrementToken() throws IOException {
+      if( !getNextPartialSnippet() )
+        return false;
+      clearAttributes();
+      termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
+      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
+      return true;
+    }
+
+    private int getFinalOffset() {
+      return nextStartOffset;
+    }
+    
+    @Override
+    public final void end(){
+      offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
+    }
+    
+    protected boolean getNextPartialSnippet() throws IOException {
+      if( snippet != null && snippet.length() >= startTerm + 1 + n ){
+        startTerm++;
+        startOffset++;
+        lenTerm = n;
+        return true;
+      }
+      return getNextSnippet();
+    }
+    
+    protected boolean getNextSnippet() throws IOException {
+      startTerm = 0;
+      startOffset = nextStartOffset;
+      snippetBuffer.delete( 0, snippetBuffer.length() );
+      while( true ){
+        if( ch != -1 )
+          ch = readCharFromBuffer();
+        if( ch == -1 ) break;
+        else if( !isDelimiter( ch ) )
+          snippetBuffer.append( (char)ch );
+        else if( snippetBuffer.length() > 0 )
+          break;
+        else
+          startOffset++;
+      }
+      if( snippetBuffer.length() == 0 )
+        return false;
+      snippet = snippetBuffer.toString();
+      lenTerm = snippet.length() >= n ? n : snippet.length();
+      return true;
+    }
+    
+    protected int readCharFromBuffer() throws IOException {
+      if( charBufferIndex >= charBufferLen ){
+        charBufferLen = input.read( charBuffer );
+        if( charBufferLen == -1 ){
+          return -1;
+        }
+        charBufferIndex = 0;
+      }
+      int c = charBuffer[charBufferIndex++];
+      nextStartOffset++;
+      return c;
+    }
+    
+    protected boolean isDelimiter( int c ){
+      return delimiters.indexOf( c ) >= 0;
+    }
+    
+    @Override
+    public void reset( Reader input ) throws IOException {
+      super.reset( input );
+      reset();
+    }
+    
+    @Override
+    public void reset() throws IOException {
+      startTerm = 0;
+      nextStartOffset = 0;
+      snippet = null;
+      snippetBuffer.setLength( 0 );
+      charBufferIndex = BUFFER_SIZE;
+      charBufferLen = 0;
+      ch = 0;
+    }
+  }
+
+  protected void make1d1fIndex( String value ) throws Exception {
+    make1dmfIndex( value );
+  }
+  
+  protected void make1d1fIndexB( String value ) throws Exception {
+    make1dmfIndexB( value );
+  }
+  
+  protected void make1dmfIndex( String... values ) throws Exception {
+    make1dmfIndex( analyzerW, values );
+  }
+  
+  protected void make1dmfIndexB( String... values ) throws Exception {
+    make1dmfIndex( analyzerB, values );
+  }
+  
+  // make 1 doc with multi valued field
+  protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
+    Document doc = new Document();
+    for( String value: values )
+      doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+    writer.addDocument( doc );
+    writer.close();
+    if (reader != null) reader.close();
+    reader = IndexReader.open( dir, true );
+  }
+  
+  // make 1 doc with multi valued & not analyzed field
+  protected void make1dmfIndexNA( String... values ) throws Exception {
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzerK).setOpenMode(OpenMode.CREATE));
+    Document doc = new Document();
+    for( String value: values )
+      doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+    writer.addDocument( doc );
+    writer.close();
+    if (reader != null) reader.close();
+    reader = IndexReader.open( dir, true );
+  }
+  
+  protected void makeIndexShortMV() throws Exception {
+    
+    //  0
+    // ""
+    //  1
+    // ""
+
+    //  234567
+    // "a b c"
+    //  0 1 2
+
+    //  8
+    // ""
+
+    //   111
+    //  9012
+    // "d e"
+    //  3 4
+    make1dmfIndex( shortMVValues );
+  }
+  
+  protected void makeIndexLongMV() throws Exception {
+    //           11111111112222222222333333333344444444445555555555666666666677777777778888888888999
+    // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
+    // Followings are the examples of customizable parameters and actual examples of customization:
+    // 0          1   2   3        4  5            6          7   8      9        10 11
+    
+    //        1                                                                                                   2
+    // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
+    // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+    // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
+    // 12  13  (14)   (15)     16  17   18  19 20    21       22   23 (24)   (25)     26   27   28   29  30  31  32   33      34
+
+    make1dmfIndex( longMVValues );
+  }
+  
+  protected void makeIndexLongMVB() throws Exception {
+    // "*" ... LF
+    
+    //           1111111111222222222233333333334444444444555555
+    // 01234567890123456789012345678901234567890123456789012345
+    // *Lucene/Solr does not require such additional hardware.
+    //  Lu 0        do 10    re 15   su 21       na 31
+    //   uc 1        oe 11    eq 16   uc 22       al 32
+    //    ce 2        es 12    qu 17   ch 23         ha 33
+    //     en 3          no 13  ui 18     ad 24       ar 34
+    //      ne 4          ot 14  ir 19     dd 25       rd 35
+    //       e/ 5                 re 20     di 26       dw 36
+    //        /S 6                           it 27       wa 37
+    //         So 7                           ti 28       ar 38
+    //          ol 8                           io 29       re 39
+    //           lr 9                           on 30
+
+    // 5555666666666677777777778888888888999999999
+    // 6789012345678901234567890123456789012345678
+    // *When you talk about processing speed, the
+    //  Wh 40         ab 48     es 56         th 65
+    //   he 41         bo 49     ss 57         he 66
+    //    en 42         ou 50     si 58
+    //       yo 43       ut 51     in 59
+    //        ou 44         pr 52   ng 60
+    //           ta 45       ro 53     sp 61
+    //            al 46       oc 54     pe 62
+    //             lk 47       ce 55     ee 63
+    //                                    ed 64
+
+    make1dmfIndexB( biMVValues );
+  }
+  
+  protected void makeIndexStrMV() throws Exception {
+
+    //  0123
+    // "abc"
+    
+    //  34567
+    // "defg"
+
+    //     111
+    //  789012
+    // "hijkl"
+    make1dmfIndexNA( strMVValues );
+  }
+}