--- /dev/null
+package org.apache.lucene.search.vectorhighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public abstract class AbstractTestCase extends LuceneTestCase {
+
+ // Field name used by default by the query helpers (tq, pqF), the query parsers
+ // and the index builders of this class.
+ protected final String F = "f";
+ // Extra field names; not referenced inside this class (presumably for subclasses - TODO confirm).
+ protected final String F1 = "f1";
+ protected final String F2 = "f2";
+ // Directory holding the test index; created in setUp() and closed in tearDown().
+ protected Directory dir;
+ // MockAnalyzer configured with MockTokenizer.WHITESPACE (see setUp()).
+ protected Analyzer analyzerW;
+ // BigramAnalyzer, i.e. BasicNGramTokenizer with its default settings (see setUp()).
+ protected Analyzer analyzerB;
+ // MockAnalyzer configured with MockTokenizer.KEYWORD (see setUp()).
+ protected Analyzer analyzerK;
+ // Reader on the most recently built index; opened by the make1dmfIndex* helpers, closed in tearDown().
+ protected IndexReader reader;
+ // Query parser on field F using analyzerW.
+ protected QueryParser paW;
+ // Query parser on field F using analyzerB.
+ protected QueryParser paB;
+
+ // Values of a multi valued field with short and empty entries; indexed by makeIndexShortMV().
+ protected static final String[] shortMVValues = {
+ "",
+ "",
+ "a b c",
+ "", // empty data in multi valued field
+ "d e"
+ };
+
+ // Values of a multi valued field with long sentences; indexed by makeIndexLongMV().
+ // The text is test data: the offset rulers in makeIndexLongMV() refer to it character by character.
+ protected static final String[] longMVValues = {
+ "Followings are the examples of customizable parameters and actual examples of customization:",
+ "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
+ };
+
+ // test data for LUCENE-1448 bug
+ // Each value starts with a line feed; indexed with the bigram analyzer by makeIndexLongMVB().
+ protected static final String[] biMVValues = {
+ "\nLucene/Solr does not require such additional hardware.",
+ "\nWhen you talk about processing speed, the"
+ };
+
+ // Values indexed as NOT_ANALYZED by makeIndexStrMV().
+ protected static final String[] strMVValues = {
+ "abc",
+ "defg",
+ "hijkl"
+ };
+
+ /**
+  * Prepares the fixture shared by the tests: the three analyzers, the two
+  * query parsers on field {@link #F} and a fresh directory for the index.
+  */
+ @Override
+ public void setUp() throws Exception {
+   super.setUp();
+
+   // analyzers: whitespace, bigram and keyword
+   this.analyzerW = new MockAnalyzer( random, MockTokenizer.WHITESPACE, false );
+   this.analyzerB = new BigramAnalyzer();
+   this.analyzerK = new MockAnalyzer( random, MockTokenizer.KEYWORD, false );
+
+   // query parsers on the default field, one per tokenizing analyzer
+   this.paW = new QueryParser( TEST_VERSION_CURRENT, this.F, this.analyzerW );
+   this.paB = new QueryParser( TEST_VERSION_CURRENT, this.F, this.analyzerB );
+
+   this.dir = newDirectory();
+ }
+
+ /**
+  * Releases the reader and the directory, then runs the superclass tear down.
+  * <p>
+  * Every step runs even if an earlier one throws, and a directory that was never
+  * created (setUp() failed early) is skipped instead of raising a
+  * NullPointerException that would hide the original failure.
+  */
+ @Override
+ public void tearDown() throws Exception {
+   try {
+     if( reader != null ){
+       reader.close();
+     }
+   } finally {
+     // drop the reference even when close() failed, the reader must not be reused
+     reader = null;
+     try {
+       if( dir != null ){
+         dir.close();
+       }
+     } finally {
+       super.tearDown();
+     }
+   }
+ }
+
+ protected Query tq( String text ){
+ return tq( 1F, text );
+ }
+
+ protected Query tq( float boost, String text ){
+ return tq( boost, F, text );
+ }
+
+ protected Query tq( String field, String text ){
+ return tq( 1F, field, text );
+ }
+
+ protected Query tq( float boost, String field, String text ){
+ Query query = new TermQuery( new Term( field, text ) );
+ query.setBoost( boost );
+ return query;
+ }
+
+ protected Query pqF( String... texts ){
+ return pqF( 1F, texts );
+ }
+
+ protected Query pqF( float boost, String... texts ){
+ return pqF( boost, 0, texts );
+ }
+
+ protected Query pqF( float boost, int slop, String... texts ){
+ return pq( boost, slop, F, texts );
+ }
+
+ protected Query pq( String field, String... texts ){
+ return pq( 1F, 0, field, texts );
+ }
+
+ protected Query pq( float boost, String field, String... texts ){
+ return pq( boost, 0, field, texts );
+ }
+
+ protected Query pq( float boost, int slop, String field, String... texts ){
+ PhraseQuery query = new PhraseQuery();
+ for( String text : texts ){
+ query.add( new Term( field, text ) );
+ }
+ query.setBoost( boost );
+ query.setSlop( slop );
+ return query;
+ }
+
+ protected Query dmq( Query... queries ){
+ return dmq( 0.0F, queries );
+ }
+
+ protected Query dmq( float tieBreakerMultiplier, Query... queries ){
+ DisjunctionMaxQuery query = new DisjunctionMaxQuery( tieBreakerMultiplier );
+ for( Query q : queries ){
+ query.add( q );
+ }
+ return query;
+ }
+
+ /**
+  * Asserts that {@code actual} has as many elements as {@code expected} and
+  * contains every expected query; the order of the elements is not checked.
+  * Failure messages name the offending query and show the actual collection.
+  *
+  * @param actual collection produced by the code under test
+  * @param expected queries that must all be contained in {@code actual}
+  */
+ protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
+   assertEquals( "unexpected number of queries in " + actual, expected.length, actual.size() );
+   for( Query query : expected ){
+     assertTrue( "expected query <" + query + "> not found in " + actual, actual.contains( query ) );
+   }
+ }
+
+ /** Analyzer that tokenizes every field with a {@link BasicNGramTokenizer} in its default configuration. */
+ static final class BigramAnalyzer extends Analyzer {
+
+   /** Returns a new tokenizer over {@code reader}; {@code fieldName} is ignored. */
+   @Override
+   public TokenStream tokenStream( String fieldName, Reader reader ){
+     final Tokenizer bigrams = new BasicNGramTokenizer( reader );
+     return bigrams;
+   }
+ }
+
+ static final class BasicNGramTokenizer extends Tokenizer {
+
+ public static final int DEFAULT_N_SIZE = 2;
+ public static final String DEFAULT_DELIMITERS = " \t\n.,";
+ private final int n;
+ private final String delimiters;
+ private int startTerm;
+ private int lenTerm;
+ private int startOffset;
+ private int nextStartOffset;
+ private int ch;
+ private String snippet;
+ private StringBuilder snippetBuffer;
+ private static final int BUFFER_SIZE = 4096;
+ private char[] charBuffer;
+ private int charBufferIndex;
+ private int charBufferLen;
+
+ public BasicNGramTokenizer( Reader in ){
+ this( in, DEFAULT_N_SIZE );
+ }
+
+ public BasicNGramTokenizer( Reader in, int n ){
+ this( in, n, DEFAULT_DELIMITERS );
+ }
+
+ public BasicNGramTokenizer( Reader in, String delimiters ){
+ this( in, DEFAULT_N_SIZE, delimiters );
+ }
+
+ public BasicNGramTokenizer( Reader in, int n, String delimiters ){
+ super(in);
+ this.n = n;
+ this.delimiters = delimiters;
+ startTerm = 0;
+ nextStartOffset = 0;
+ snippet = null;
+ snippetBuffer = new StringBuilder();
+ charBuffer = new char[BUFFER_SIZE];
+ charBufferIndex = BUFFER_SIZE;
+ charBufferLen = 0;
+ ch = 0;
+ }
+
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ @Override
+ public boolean incrementToken() throws IOException {
+ if( !getNextPartialSnippet() )
+ return false;
+ clearAttributes();
+ termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
+ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
+ return true;
+ }
+
+ private int getFinalOffset() {
+ return nextStartOffset;
+ }
+
+ @Override
+ public final void end(){
+ offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
+ }
+
+ protected boolean getNextPartialSnippet() throws IOException {
+ if( snippet != null && snippet.length() >= startTerm + 1 + n ){
+ startTerm++;
+ startOffset++;
+ lenTerm = n;
+ return true;
+ }
+ return getNextSnippet();
+ }
+
+ protected boolean getNextSnippet() throws IOException {
+ startTerm = 0;
+ startOffset = nextStartOffset;
+ snippetBuffer.delete( 0, snippetBuffer.length() );
+ while( true ){
+ if( ch != -1 )
+ ch = readCharFromBuffer();
+ if( ch == -1 ) break;
+ else if( !isDelimiter( ch ) )
+ snippetBuffer.append( (char)ch );
+ else if( snippetBuffer.length() > 0 )
+ break;
+ else
+ startOffset++;
+ }
+ if( snippetBuffer.length() == 0 )
+ return false;
+ snippet = snippetBuffer.toString();
+ lenTerm = snippet.length() >= n ? n : snippet.length();
+ return true;
+ }
+
+ protected int readCharFromBuffer() throws IOException {
+ if( charBufferIndex >= charBufferLen ){
+ charBufferLen = input.read( charBuffer );
+ if( charBufferLen == -1 ){
+ return -1;
+ }
+ charBufferIndex = 0;
+ }
+ int c = charBuffer[charBufferIndex++];
+ nextStartOffset++;
+ return c;
+ }
+
+ protected boolean isDelimiter( int c ){
+ return delimiters.indexOf( c ) >= 0;
+ }
+
+ @Override
+ public void reset( Reader input ) throws IOException {
+ super.reset( input );
+ reset();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ startTerm = 0;
+ nextStartOffset = 0;
+ snippet = null;
+ snippetBuffer.setLength( 0 );
+ charBufferIndex = BUFFER_SIZE;
+ charBufferLen = 0;
+ ch = 0;
+ }
+ }
+
+ protected void make1d1fIndex( String value ) throws Exception {
+ make1dmfIndex( value );
+ }
+
+ protected void make1d1fIndexB( String value ) throws Exception {
+ make1dmfIndexB( value );
+ }
+
+ protected void make1dmfIndex( String... values ) throws Exception {
+ make1dmfIndex( analyzerW, values );
+ }
+
+ protected void make1dmfIndexB( String... values ) throws Exception {
+ make1dmfIndex( analyzerB, values );
+ }
+
+ /**
+  * Makes 1 doc with a multi valued field: (re)creates the index in {@code dir}
+  * with a single document whose field {@link #F} gets one analyzed, stored value
+  * with term vector positions and offsets per entry of {@code values}, then
+  * replaces {@code reader} with a read-only reader on the new index.
+  * <p>
+  * The writer is closed even when building or adding the document fails, so its
+  * write lock cannot leak into tearDown().
+  *
+  * @param analyzer analyzer used by the index writer
+  * @param values field values, added in the given order
+  */
+ protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
+   IndexWriter writer = new IndexWriter( dir, new IndexWriterConfig(
+       TEST_VERSION_CURRENT, analyzer ).setOpenMode( OpenMode.CREATE ) );
+   try {
+     Document doc = new Document();
+     for( String value : values ){
+       doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+     }
+     writer.addDocument( doc );
+   } finally {
+     writer.close();
+   }
+   if( reader != null ){
+     reader.close();
+     // do not keep a closed reader around if reopening below fails
+     reader = null;
+   }
+   reader = IndexReader.open( dir, true );
+ }
+
+ /**
+  * Makes 1 doc with a multi valued and not analyzed field: (re)creates the index
+  * in {@code dir} with a single document whose field {@link #F} gets one
+  * NOT_ANALYZED, stored value with term vector positions and offsets per entry of
+  * {@code values}, then replaces {@code reader} with a read-only reader on the
+  * new index.
+  * <p>
+  * The writer is closed even when building or adding the document fails, so its
+  * write lock cannot leak into tearDown().
+  *
+  * @param values field values, added in the given order
+  */
+ protected void make1dmfIndexNA( String... values ) throws Exception {
+   IndexWriter writer = new IndexWriter( dir, new IndexWriterConfig(
+       TEST_VERSION_CURRENT, analyzerK ).setOpenMode( OpenMode.CREATE ) );
+   try {
+     Document doc = new Document();
+     for( String value : values ){
+       doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+     }
+     writer.addDocument( doc );
+   } finally {
+     writer.close();
+   }
+   if( reader != null ){
+     reader.close();
+     // do not keep a closed reader around if reopening below fails
+     reader = null;
+   }
+   reader = IndexReader.open( dir, true );
+ }
+
+ /**
+  * Indexes {@link #shortMVValues} as one document with a multi valued field,
+  * through make1dmfIndex(String...) and therefore with {@code analyzerW}.
+  * The rulers below presumably show, per value, the character offsets above the
+  * text and the term positions below it.
+  */
+ protected void makeIndexShortMV() throws Exception {
+
+ // 0
+ // ""
+ // 1
+ // ""
+
+ // 234567
+ // "a b c"
+ // 0 1 2
+
+ // 8
+ // ""
+
+ // 111
+ // 9012
+ // "d e"
+ // 3 4
+ make1dmfIndex( shortMVValues );
+ }
+
+ /**
+  * Indexes {@link #longMVValues} as one document with a multi valued field,
+  * through make1dmfIndex(String...) and therefore with {@code analyzerW}.
+  * The rulers below presumably show the character offsets above each sentence
+  * and the term positions below it (meaning of the parenthesized positions is
+  * not visible here - TODO confirm).
+  */
+ protected void makeIndexLongMV() throws Exception {
+ // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
+ // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
+ // Followings are the examples of customizable parameters and actual examples of customization:
+ // 0 1 2 3 4 5 6 7 8 9 10 11
+
+ // 1 2
+ // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
+ // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
+ // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
+ // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34
+
+ make1dmfIndex( longMVValues );
+ }
+
+ /**
+  * Indexes {@link #biMVValues} as one document with a multi valued field,
+  * through make1dmfIndexB(String...) and therefore with the bigram analyzer
+  * {@code analyzerB}. The tables below list the bigrams of each value, each
+  * followed by a number that is presumably its term position.
+  */
+ protected void makeIndexLongMVB() throws Exception {
+ // "*" ... LF
+
+ // 1111111111222222222233333333334444444444555555
+ // 01234567890123456789012345678901234567890123456789012345
+ // *Lucene/Solr does not require such additional hardware.
+ // Lu 0 do 10 re 15 su 21 na 31
+ // uc 1 oe 11 eq 16 uc 22 al 32
+ // ce 2 es 12 qu 17 ch 23 ha 33
+ // en 3 no 13 ui 18 ad 24 ar 34
+ // ne 4 ot 14 ir 19 dd 25 rd 35
+ // e/ 5 re 20 di 26 dw 36
+ // /S 6 it 27 wa 37
+ // So 7 ti 28 ar 38
+ // ol 8 io 29 re 39
+ // lr 9 on 30
+
+ // 5555666666666677777777778888888888999999999
+ // 6789012345678901234567890123456789012345678
+ // *When you talk about processing speed, the
+ // Wh 40 ab 48 es 56 th 65
+ // he 41 bo 49 ss 57 he 66
+ // en 42 ou 50 si 58
+ // yo 43 ut 51 in 59
+ // ou 44 pr 52 ng 60
+ // ta 45 ro 53 sp 61
+ // al 46 oc 54 pe 62
+ // lk 47 ce 55 ee 63
+ // ed 64
+
+ make1dmfIndexB( biMVValues );
+ }
+
+ /**
+  * Indexes {@link #strMVValues} as one document with a multi valued,
+  * not analyzed field through make1dmfIndexNA(String...).
+  * The rulers below presumably show the character offsets of each value.
+  */
+ protected void makeIndexStrMV() throws Exception {
+
+ // 0123
+ // "abc"
+
+ // 34567
+ // "defg"
+
+ // 111
+ // 789012
+ // "hijkl"
+ make1dmfIndexNA( strMVValues );
+ }
+}