1 package org.apache.lucene.search.vectorhighlight;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.util.Collection;
24 import org.apache.lucene.analysis.Analyzer;
25 import org.apache.lucene.analysis.MockAnalyzer;
26 import org.apache.lucene.analysis.MockTokenizer;
27 import org.apache.lucene.analysis.TokenStream;
28 import org.apache.lucene.analysis.Tokenizer;
29 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
30 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
31 import org.apache.lucene.document.Document;
32 import org.apache.lucene.document.Field;
33 import org.apache.lucene.document.Field.Index;
34 import org.apache.lucene.document.Field.Store;
35 import org.apache.lucene.document.Field.TermVector;
36 import org.apache.lucene.index.IndexReader;
37 import org.apache.lucene.index.IndexWriter;
38 import org.apache.lucene.index.IndexWriterConfig;
39 import org.apache.lucene.index.Term;
40 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
41 import org.apache.lucene.queryParser.QueryParser;
42 import org.apache.lucene.search.DisjunctionMaxQuery;
43 import org.apache.lucene.search.PhraseQuery;
44 import org.apache.lucene.search.Query;
45 import org.apache.lucene.search.TermQuery;
46 import org.apache.lucene.store.Directory;
47 import org.apache.lucene.util.LuceneTestCase;
public abstract class AbstractTestCase extends LuceneTestCase {

  // Field names used by the highlighter tests.
  protected final String F = "f";
  protected final String F1 = "f1";
  protected final String F2 = "f2";
  protected Directory dir;       // index directory shared by the tests
  protected Analyzer analyzerW;  // whitespace-tokenizing analyzer (see setUp)
  protected Analyzer analyzerB;  // character-bigram analyzer (BigramAnalyzer)
  protected Analyzer analyzerK;  // keyword analyzer — whole value as one token
  protected IndexReader reader;  // reader over the most recently built index
  protected QueryParser paW;     // query parser backed by analyzerW
  protected QueryParser paB;     // query parser backed by analyzerB

  // Multi-valued test data used by the makeIndex*MV helpers below.
  // NOTE(review): several array elements and the closing "};" of these
  // initializers appear to be missing from the chunk under review; the
  // code is kept exactly as seen.
  protected static final String[] shortMVValues = {
    "", // empty data in multi valued field

  protected static final String[] longMVValues = {
    "Followings are the examples of customizable parameters and actual examples of customization:",
    "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"

  // test data for LUCENE-1448 bug
  protected static final String[] biMVValues = {
    "\nLucene/Solr does not require such additional hardware.",
    "\nWhen you talk about processing speed, the"

  // not-analyzed (string) multi-valued test data; elements not visible here
  protected static final String[] strMVValues = {
88 public void setUp() throws Exception {
90 analyzerW = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
91 analyzerB = new BigramAnalyzer();
92 analyzerK = new MockAnalyzer(random, MockTokenizer.KEYWORD, false);
93 paW = new QueryParser(TEST_VERSION_CURRENT, F, analyzerW );
94 paB = new QueryParser(TEST_VERSION_CURRENT, F, analyzerB );
99 public void tearDown() throws Exception {
100 if( reader != null ){
108 protected Query tq( String text ){
109 return tq( 1F, text );
112 protected Query tq( float boost, String text ){
113 return tq( boost, F, text );
116 protected Query tq( String field, String text ){
117 return tq( 1F, field, text );
120 protected Query tq( float boost, String field, String text ){
121 Query query = new TermQuery( new Term( field, text ) );
122 query.setBoost( boost );
126 protected Query pqF( String... texts ){
127 return pqF( 1F, texts );
130 protected Query pqF( float boost, String... texts ){
131 return pqF( boost, 0, texts );
134 protected Query pqF( float boost, int slop, String... texts ){
135 return pq( boost, slop, F, texts );
138 protected Query pq( String field, String... texts ){
139 return pq( 1F, 0, field, texts );
142 protected Query pq( float boost, String field, String... texts ){
143 return pq( boost, 0, field, texts );
146 protected Query pq( float boost, int slop, String field, String... texts ){
147 PhraseQuery query = new PhraseQuery();
148 for( String text : texts ){
149 query.add( new Term( field, text ) );
151 query.setBoost( boost );
152 query.setSlop( slop );
156 protected Query dmq( Query... queries ){
157 return dmq( 0.0F, queries );
160 protected Query dmq( float tieBreakerMultiplier, Query... queries ){
161 DisjunctionMaxQuery query = new DisjunctionMaxQuery( tieBreakerMultiplier );
162 for( Query q : queries ){
168 protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
169 assertEquals( expected.length, actual.size() );
170 for( Query query : expected ){
171 assertTrue( actual.contains( query ) );
175 static final class BigramAnalyzer extends Analyzer {
177 public TokenStream tokenStream(String fieldName, Reader reader) {
178 return new BasicNGramTokenizer( reader );
  /**
   * Character n-gram tokenizer used by {@link BigramAnalyzer}: the input is
   * split into "snippets" on the configured delimiter characters, and each
   * snippet is then emitted as a sequence of character n-grams (default n=2),
   * with offsets corrected via {@code correctOffset}.
   *
   * NOTE(review): several lines of this class appear to be missing from the
   * chunk under review (e.g. the declarations of the n-gram size {@code n}
   * and current term length {@code lenTerm}, loop headers and a number of
   * closing braces). The code is kept byte-for-byte as seen; only comments
   * are added.
   */
  static final class BasicNGramTokenizer extends Tokenizer {

    // default n-gram size (bigrams)
    public static final int DEFAULT_N_SIZE = 2;
    // characters that terminate a snippet
    public static final String DEFAULT_DELIMITERS = " \t\n.,";
    private final String delimiters;
    private int startTerm;          // start index of the current n-gram within the snippet
    private int startOffset;        // offset of the current snippet in the whole input
    private int nextStartOffset;    // offset where the next snippet begins
    private String snippet;         // current delimited chunk being n-grammed
    private StringBuilder snippetBuffer;  // scratch buffer while reading a snippet
    private static final int BUFFER_SIZE = 4096;
    private char[] charBuffer;      // read-ahead buffer over the input Reader
    private int charBufferIndex;    // next unread position in charBuffer
    private int charBufferLen;      // number of valid chars in charBuffer

    // Convenience constructor: default n and delimiters.
    public BasicNGramTokenizer( Reader in ){
      this( in, DEFAULT_N_SIZE );
    // Convenience constructor: default delimiters.
    public BasicNGramTokenizer( Reader in, int n ){
      this( in, n, DEFAULT_DELIMITERS );
    // Convenience constructor: default n.
    public BasicNGramTokenizer( Reader in, String delimiters ){
      this( in, DEFAULT_N_SIZE, delimiters );
    // Primary constructor; sets up the snippet and read-ahead buffers.
    public BasicNGramTokenizer( Reader in, int n, String delimiters ){
      this.delimiters = delimiters;
      snippetBuffer = new StringBuilder();
      charBuffer = new char[BUFFER_SIZE];
      // start "empty" so the first read refills the buffer
      charBufferIndex = BUFFER_SIZE;

    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    // Emits the next n-gram, or returns false at end of input.
    public boolean incrementToken() throws IOException {
      if( !getNextPartialSnippet() )
      // copy the current n-gram [startTerm, startTerm+lenTerm) into the term attribute
      termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));

    // Final offset reported at end-of-stream.
    private int getFinalOffset() {
      return nextStartOffset;

    @Override
    public final void end(){
      offsetAtt.setOffset(getFinalOffset(),getFinalOffset());

    // Advances within the current snippet if another n-gram fits,
    // otherwise falls through to reading the next snippet.
    protected boolean getNextPartialSnippet() throws IOException {
      if( snippet != null && snippet.length() >= startTerm + 1 + n ){
      return getNextSnippet();

    // Reads characters up to the next delimiter into snippetBuffer and
    // starts n-gramming the resulting snippet.
    protected boolean getNextSnippet() throws IOException {
      startOffset = nextStartOffset;
      snippetBuffer.delete( 0, snippetBuffer.length() );
        ch = readCharFromBuffer();
        if( ch == -1 ) break;                 // end of input
        else if( !isDelimiter( ch ) )
          snippetBuffer.append( (char)ch );
        else if( snippetBuffer.length() > 0 )
      if( snippetBuffer.length() == 0 )
      snippet = snippetBuffer.toString();
      // first term length: full n, or the whole snippet if shorter
      lenTerm = snippet.length() >= n ? n : snippet.length();

    // Returns the next char from the buffered Reader, or -1 at EOF.
    protected int readCharFromBuffer() throws IOException {
      if( charBufferIndex >= charBufferLen ){
        charBufferLen = input.read( charBuffer );
        if( charBufferLen == -1 ){
      int c = charBuffer[charBufferIndex++];

    protected boolean isDelimiter( int c ){
      return delimiters.indexOf( c ) >= 0;

    @Override
    public void reset( Reader input ) throws IOException {
      super.reset( input );

    // Resets internal state so the tokenizer can be reused.
    @Override
    public void reset() throws IOException {
      snippetBuffer.setLength( 0 );
      charBufferIndex = BUFFER_SIZE;
314 protected void make1d1fIndex( String value ) throws Exception {
315 make1dmfIndex( value );
318 protected void make1d1fIndexB( String value ) throws Exception {
319 make1dmfIndexB( value );
322 protected void make1dmfIndex( String... values ) throws Exception {
323 make1dmfIndex( analyzerW, values );
326 protected void make1dmfIndexB( String... values ) throws Exception {
327 make1dmfIndex( analyzerB, values );
330 // make 1 doc with multi valued field
331 protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
332 IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
333 TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
334 Document doc = new Document();
335 for( String value: values )
336 doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
337 writer.addDocument( doc );
339 if (reader != null) reader.close();
340 reader = IndexReader.open( dir, true );
343 // make 1 doc with multi valued & not analyzed field
344 protected void make1dmfIndexNA( String... values ) throws Exception {
345 IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
346 TEST_VERSION_CURRENT, analyzerK).setOpenMode(OpenMode.CREATE));
347 Document doc = new Document();
348 for( String value: values )
349 doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
350 writer.addDocument( doc );
352 if (reader != null) reader.close();
353 reader = IndexReader.open( dir, true );
356 protected void makeIndexShortMV() throws Exception {
374 make1dmfIndex( shortMVValues );
377 protected void makeIndexLongMV() throws Exception {
378 // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
379 // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
380 // Followings are the examples of customizable parameters and actual examples of customization:
381 // 0 1 2 3 4 5 6 7 8 9 10 11
384 // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
385 // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
386 // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
387 // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34
389 make1dmfIndex( longMVValues );
392 protected void makeIndexLongMVB() throws Exception {
395 // 1111111111222222222233333333334444444444555555
396 // 01234567890123456789012345678901234567890123456789012345
397 // *Lucene/Solr does not require such additional hardware.
398 // Lu 0 do 10 re 15 su 21 na 31
399 // uc 1 oe 11 eq 16 uc 22 al 32
400 // ce 2 es 12 qu 17 ch 23 ha 33
401 // en 3 no 13 ui 18 ad 24 ar 34
402 // ne 4 ot 14 ir 19 dd 25 rd 35
403 // e/ 5 re 20 di 26 dw 36
409 // 5555666666666677777777778888888888999999999
410 // 6789012345678901234567890123456789012345678
411 // *When you talk about processing speed, the
412 // Wh 40 ab 48 es 56 th 65
413 // he 41 bo 49 ss 57 he 66
422 make1dmfIndexB( biMVValues );
425 protected void makeIndexStrMV() throws Exception {
436 make1dmfIndexNA( strMVValues );