lucene-java-3.5.0/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java

   1 package org.apache.lucene.search.highlight;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.Analyzer;
  24 import org.apache.lucene.analysis.Token;
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  28 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  29 import org.apache.lucene.document.Document;
  30 import org.apache.lucene.document.Field;
  31 import org.apache.lucene.document.Field.TermVector;
  32 import org.apache.lucene.index.CorruptIndexException;
  33 import org.apache.lucene.index.IndexReader;
  34 import org.apache.lucene.index.IndexWriter;
  35 import org.apache.lucene.index.Term;
  36 import org.apache.lucene.index.TermPositionVector;
  37 import org.apache.lucene.search.DisjunctionMaxQuery;
  38 import org.apache.lucene.search.IndexSearcher;
  39 import org.apache.lucene.search.Query;
  40 import org.apache.lucene.search.TopDocs;
  41 import org.apache.lucene.search.spans.SpanNearQuery;
  42 import org.apache.lucene.search.spans.SpanQuery;
  43 import org.apache.lucene.search.spans.SpanTermQuery;
  44 import org.apache.lucene.store.Directory;
  45 import org.apache.lucene.store.LockObtainFailedException;
  46 import org.apache.lucene.util.LuceneTestCase;
  47
  48 // LUCENE-2874
  49 public class TokenSourcesTest extends LuceneTestCase {
  50   private static final String FIELD = "text";
  51
  52   private static final class OverlapAnalyzer extends Analyzer {
  53
  54     @Override
  55     public TokenStream tokenStream(String fieldName, Reader reader) {
  56       return new TokenStreamOverlap();
  57     }
  58   }
  59
  60   private static final class TokenStreamOverlap extends TokenStream {
  61     private Token[] tokens;
  62
  63     private int i = -1;
  64
  65     private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  66     private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  67     private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
  68
  69     public TokenStreamOverlap() {
  70       reset();
  71     }
  72
  73     @Override
  74     public boolean incrementToken() throws IOException {
  75       this.i++;
  76       if (this.i >= this.tokens.length) {
  77         return false;
  78       }
  79       clearAttributes();
  80       termAttribute.setEmpty().append(this.tokens[i]);
  81       offsetAttribute.setOffset(this.tokens[i].startOffset(),
  82           this.tokens[i].endOffset());
  83       positionIncrementAttribute.setPositionIncrement(this.tokens[i]
  84           .getPositionIncrement());
  85       return true;
  86     }
  87
  88     @Override
  89     public void reset() {
  90       this.i = -1;
  91       this.tokens = new Token[] {
  92           new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
  93           new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
  94           new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
  95           new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
  96           new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
  97           new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
  98       this.tokens[1].setPositionIncrement(0);
  99     }
 100   }
 101
 102   public void testOverlapWithOffset() throws CorruptIndexException,
 103       LockObtainFailedException, IOException, InvalidTokenOffsetsException {
 104     final String TEXT = "the fox did not jump";
 105     final Directory directory = newDirectory();
 106     final IndexWriter indexWriter = new IndexWriter(directory,
 107         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
 108     try {
 109       final Document document = new Document();
 110       document.add(new Field(FIELD, new TokenStreamOverlap(),
 111           TermVector.WITH_OFFSETS));
 112       indexWriter.addDocument(document);
 113     } finally {
 114       indexWriter.close();
 115     }
 116     final IndexReader indexReader = IndexReader.open(directory, true);
 117     try {
 118       assertEquals(1, indexReader.numDocs());
 119       final IndexSearcher indexSearcher = newSearcher(indexReader);
 120       try {
 121         final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
 122         query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
 123         query.add(new SpanTermQuery(new Term(FIELD, "fox")));
 124         // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
 125         // new SpanTermQuery(new Term(FIELD, "{fox}")),
 126         // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
 127
 128         TopDocs hits = indexSearcher.search(query, 1);
 129         assertEquals(1, hits.totalHits);
 130         final Highlighter highlighter = new Highlighter(
 131             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
 132             new QueryScorer(query));
 133         final TokenStream tokenStream = TokenSources
 134             .getTokenStream(
 135                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
 136                 false);
 137         assertEquals("<B>the fox</B> did not jump",
 138             highlighter.getBestFragment(tokenStream, TEXT));
 139       } finally {
 140         indexSearcher.close();
 141       }
 142     } finally {
 143       indexReader.close();
 144       directory.close();
 145     }
 146   }
 147
 148   public void testOverlapWithPositionsAndOffset() throws CorruptIndexException,
 149       LockObtainFailedException, IOException, InvalidTokenOffsetsException {
 150     final String TEXT = "the fox did not jump";
 151     final Directory directory = newDirectory();
 152     final IndexWriter indexWriter = new IndexWriter(directory,
 153         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
 154     try {
 155       final Document document = new Document();
 156       document.add(new Field(FIELD, new TokenStreamOverlap(),
 157           TermVector.WITH_POSITIONS_OFFSETS));
 158       indexWriter.addDocument(document);
 159     } finally {
 160       indexWriter.close();
 161     }
 162     final IndexReader indexReader = IndexReader.open(directory, true);
 163     try {
 164       assertEquals(1, indexReader.numDocs());
 165       final IndexSearcher indexSearcher = newSearcher(indexReader);
 166       try {
 167         final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
 168         query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
 169         query.add(new SpanTermQuery(new Term(FIELD, "fox")));
 170         // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
 171         // new SpanTermQuery(new Term(FIELD, "{fox}")),
 172         // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
 173
 174         TopDocs hits = indexSearcher.search(query, 1);
 175         assertEquals(1, hits.totalHits);
 176         final Highlighter highlighter = new Highlighter(
 177             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
 178             new QueryScorer(query));
 179         final TokenStream tokenStream = TokenSources
 180             .getTokenStream(
 181                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
 182                 false);
 183         assertEquals("<B>the fox</B> did not jump",
 184             highlighter.getBestFragment(tokenStream, TEXT));
 185       } finally {
 186         indexSearcher.close();
 187       }
 188     } finally {
 189       indexReader.close();
 190       directory.close();
 191     }
 192   }
 193
 194   public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
 195       LockObtainFailedException, IOException, InvalidTokenOffsetsException {
 196     final String TEXT = "the fox did not jump";
 197     final Directory directory = newDirectory();
 198     final IndexWriter indexWriter = new IndexWriter(directory,
 199         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
 200     try {
 201       final Document document = new Document();
 202       document.add(new Field(FIELD, new TokenStreamOverlap(),
 203           TermVector.WITH_OFFSETS));
 204       indexWriter.addDocument(document);
 205     } finally {
 206       indexWriter.close();
 207     }
 208     final IndexReader indexReader = IndexReader.open(directory, true);
 209     try {
 210       assertEquals(1, indexReader.numDocs());
 211       final IndexSearcher indexSearcher = newSearcher(indexReader);
 212       try {
 213         // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
 214         // query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
 215         // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
 216         final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
 217             new SpanTermQuery(new Term(FIELD, "the")),
 218             new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
 219
 220         TopDocs hits = indexSearcher.search(phraseQuery, 1);
 221         assertEquals(1, hits.totalHits);
 222         final Highlighter highlighter = new Highlighter(
 223             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
 224             new QueryScorer(phraseQuery));
 225         final TokenStream tokenStream = TokenSources
 226             .getTokenStream(
 227                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
 228                 false);
 229         assertEquals("<B>the fox</B> did not jump",
 230             highlighter.getBestFragment(tokenStream, TEXT));
 231       } finally {
 232         indexSearcher.close();
 233       }
 234     } finally {
 235       indexReader.close();
 236       directory.close();
 237     }
 238   }
 239
 240   public void testOverlapWithPositionsAndOffsetExactPhrase()
 241       throws CorruptIndexException, LockObtainFailedException, IOException,
 242       InvalidTokenOffsetsException {
 243     final String TEXT = "the fox did not jump";
 244     final Directory directory = newDirectory();
 245     final IndexWriter indexWriter = new IndexWriter(directory,
 246         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
 247     try {
 248       final Document document = new Document();
 249       document.add(new Field(FIELD, new TokenStreamOverlap(),
 250           TermVector.WITH_POSITIONS_OFFSETS));
 251       indexWriter.addDocument(document);
 252     } finally {
 253       indexWriter.close();
 254     }
 255     final IndexReader indexReader = IndexReader.open(directory, true);
 256     try {
 257       assertEquals(1, indexReader.numDocs());
 258       final IndexSearcher indexSearcher = newSearcher(indexReader);
 259       try {
 260         // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
 261         // query.add(new SpanTermQuery(new Term(FIELD, "the")));
 262         // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
 263         final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
 264             new SpanTermQuery(new Term(FIELD, "the")),
 265             new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
 266
 267         TopDocs hits = indexSearcher.search(phraseQuery, 1);
 268         assertEquals(1, hits.totalHits);
 269         final Highlighter highlighter = new Highlighter(
 270             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
 271             new QueryScorer(phraseQuery));
 272         final TokenStream tokenStream = TokenSources
 273             .getTokenStream(
 274                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
 275                 false);
 276         assertEquals("<B>the fox</B> did not jump",
 277             highlighter.getBestFragment(tokenStream, TEXT));
 278       } finally {
 279         indexSearcher.close();
 280       }
 281     } finally {
 282       indexReader.close();
 283       directory.close();
 284     }
 285   }
 286
 287 }