lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java

   1 /*
   2  * Created on 28-Oct-2004
   3  */
   4 package org.apache.lucene.search.highlight;
   5
   6 /**
   7  * Licensed to the Apache Software Foundation (ASF) under one or more
   8  * contributor license agreements.  See the NOTICE file distributed with
   9  * this work for additional information regarding copyright ownership.
  10  * The ASF licenses this file to You under the Apache License, Version 2.0
  11  * (the "License"); you may not use this file except in compliance with
  12  * the License.  You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * Unless required by applicable law or agreed to in writing, software
  17  * distributed under the License is distributed on an "AS IS" BASIS,
  18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  19  * See the License for the specific language governing permissions and
  20  * limitations under the License.
  21  */
  22
  23 import java.io.IOException;
  24 import java.io.StringReader;
  25 import java.util.ArrayList;
  26 import java.util.Comparator;
  27
  28 import org.apache.lucene.analysis.Analyzer;
  29 import org.apache.lucene.analysis.Token;
  30 import org.apache.lucene.analysis.TokenStream;
  31 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  32 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  33 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  34 import org.apache.lucene.document.Document;
  35 import org.apache.lucene.index.IndexReader;
  36 import org.apache.lucene.index.TermFreqVector;
  37 import org.apache.lucene.index.TermPositionVector;
  38 import org.apache.lucene.index.TermVectorOffsetInfo;
  39 import org.apache.lucene.util.ArrayUtil;
  40
  41 /**
  42  * Hides implementation issues associated with obtaining a TokenStream for use
  43  * with the higlighter - can obtain from TermFreqVectors with offsets and
  44  * (optionally) positions or from Analyzer class reparsing the stored content.
  45  */
  46 public class TokenSources {
  47   /**
  48    * A convenience method that tries to first get a TermPositionVector for the
  49    * specified docId, then, falls back to using the passed in
  50    * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
  51    * This is useful when you already have the document, but would prefer to use
  52    * the vector first.
  53    *
  54    * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
  55    *        and get the vector from
  56    * @param docId The docId to retrieve.
  57    * @param field The field to retrieve on the document
  58    * @param doc The document to fall back on
  59    * @param analyzer The analyzer to use for creating the TokenStream if the
  60    *        vector doesn't exist
  61    * @return The {@link org.apache.lucene.analysis.TokenStream} for the
  62    *         {@link org.apache.lucene.document.Fieldable} on the
  63    *         {@link org.apache.lucene.document.Document}
  64    * @throws IOException if there was an error loading
  65    */
  66   public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
  67       String field, Document doc, Analyzer analyzer) throws IOException {
  68     TokenStream ts = null;
  69
  70     TermFreqVector tfv = reader.getTermFreqVector(docId, field);
  71     if (tfv != null) {
  72       if (tfv instanceof TermPositionVector) {
  73         ts = getTokenStream((TermPositionVector) tfv);
  74       }
  75     }
  76     // No token info stored so fall back to analyzing raw content
  77     if (ts == null) {
  78       ts = getTokenStream(doc, field, analyzer);
  79     }
  80     return ts;
  81   }
  82
  83   /**
  84    * A convenience method that tries a number of approaches to getting a token
  85    * stream. The cost of finding there are no termVectors in the index is
  86    * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
  87    * approach to coding is probably acceptable
  88    *
  89    * @param reader
  90    * @param docId
  91    * @param field
  92    * @param analyzer
  93    * @return null if field not stored correctly
  94    * @throws IOException
  95    */
  96   public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
  97       String field, Analyzer analyzer) throws IOException {
  98     TokenStream ts = null;
  99
 100     TermFreqVector tfv = reader.getTermFreqVector(docId, field);
 101     if (tfv != null) {
 102       if (tfv instanceof TermPositionVector) {
 103         ts = getTokenStream((TermPositionVector) tfv);
 104       }
 105     }
 106     // No token info stored so fall back to analyzing raw content
 107     if (ts == null) {
 108       ts = getTokenStream(reader, docId, field, analyzer);
 109     }
 110     return ts;
 111   }
 112
 113   public static TokenStream getTokenStream(TermPositionVector tpv) {
 114     // assumes the worst and makes no assumptions about token position
 115     // sequences.
 116     return getTokenStream(tpv, false);
 117   }
 118
 119   /**
 120    * Low level api. Returns a token stream or null if no offset info available
 121    * in index. This can be used to feed the highlighter with a pre-parsed token
 122    * stream
 123    *
 124    * In my tests the speeds to recreate 1000 token streams using this method
 125    * are: - with TermVector offset only data stored - 420 milliseconds - with
 126    * TermVector offset AND position data stored - 271 milliseconds (nb timings
 127    * for TermVector with position data are based on a tokenizer with contiguous
 128    * positions - no overlaps or gaps) The cost of not using TermPositionVector
 129    * to store pre-parsed content and using an analyzer to re-parse the original
 130    * content: - reanalyzing the original content - 980 milliseconds
 131    *
 132    * The re-analyze timings will typically vary depending on - 1) The complexity
 133    * of the analyzer code (timings above were using a
 134    * stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene
 135    * reads ALL fields off the disk when accessing just one document field - can
 136    * cost dear!) 3) Use of compression on field storage - could be faster due to
 137    * compression (less disk IO) or slower (more CPU burn) depending on the
 138    * content.
 139    *
 140    * @param tpv
 141    * @param tokenPositionsGuaranteedContiguous true if the token position
 142    *        numbers have no overlaps or gaps. If looking to eek out the last
 143    *        drops of performance, set to true. If in doubt, set to false.
 144    */
 145   public static TokenStream getTokenStream(TermPositionVector tpv,
 146       boolean tokenPositionsGuaranteedContiguous) {
 147     if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) {
 148       return new TokenStreamFromTermPositionVector(tpv);
 149     }
 150
 151     // an object used to iterate across an array of tokens
 152     final class StoredTokenStream extends TokenStream {
 153       Token tokens[];
 154
 155       int currentToken = 0;
 156
 157       CharTermAttribute termAtt;
 158
 159       OffsetAttribute offsetAtt;
 160
 161       PositionIncrementAttribute posincAtt;
 162
 163       StoredTokenStream(Token tokens[]) {
 164         this.tokens = tokens;
 165         termAtt = addAttribute(CharTermAttribute.class);
 166         offsetAtt = addAttribute(OffsetAttribute.class);
 167         posincAtt = addAttribute(PositionIncrementAttribute.class);
 168       }
 169
 170       @Override
 171       public boolean incrementToken() throws IOException {
 172         if (currentToken >= tokens.length) {
 173           return false;
 174         }
 175         Token token = tokens[currentToken++];
 176         clearAttributes();
 177         termAtt.setEmpty().append(token);
 178         offsetAtt.setOffset(token.startOffset(), token.endOffset());
 179         posincAtt
 180             .setPositionIncrement(currentToken <= 1
 181                 || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
 182                     .startOffset() ? 1 : 0);
 183         return true;
 184       }
 185     }
 186     // code to reconstruct the original sequence of Tokens
 187     String[] terms = tpv.getTerms();
 188     int[] freq = tpv.getTermFrequencies();
 189     int totalTokens = 0;
 190     for (int t = 0; t < freq.length; t++) {
 191       totalTokens += freq[t];
 192     }
 193     Token tokensInOriginalOrder[] = new Token[totalTokens];
 194     ArrayList<Token> unsortedTokens = null;
 195     for (int t = 0; t < freq.length; t++) {
 196       TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
 197       if (offsets == null) {
 198         throw new IllegalArgumentException(
 199             "Required TermVector Offset information was not found");
 200       }
 201
 202       int[] pos = null;
 203       if (tokenPositionsGuaranteedContiguous) {
 204         // try get the token position info to speed up assembly of tokens into
 205         // sorted sequence
 206         pos = tpv.getTermPositions(t);
 207       }
 208       if (pos == null) {
 209         // tokens NOT stored with positions or not guaranteed contiguous - must
 210         // add to list and sort later
 211         if (unsortedTokens == null) {
 212           unsortedTokens = new ArrayList<Token>();
 213         }
 214         for (int tp = 0; tp < offsets.length; tp++) {
 215           Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp]
 216               .getEndOffset());
 217           unsortedTokens.add(token);
 218         }
 219       } else {
 220         // We have positions stored and a guarantee that the token position
 221         // information is contiguous
 222
 223         // This may be fast BUT wont work if Tokenizers used which create >1
 224         // token in same position or
 225         // creates jumps in position numbers - this code would fail under those
 226         // circumstances
 227
 228         // tokens stored with positions - can use this to index straight into
 229         // sorted array
 230         for (int tp = 0; tp < pos.length; tp++) {
 231           Token token = new Token(terms[t], offsets[tp].getStartOffset(),
 232               offsets[tp].getEndOffset());
 233           tokensInOriginalOrder[pos[tp]] = token;
 234         }
 235       }
 236     }
 237     // If the field has been stored without position data we must perform a sort
 238     if (unsortedTokens != null) {
 239       tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
 240           .size()]);
 241       ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() {
 242         public int compare(Token t1, Token t2) {
 243           if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
 244               - t2.endOffset();
 245           else return t1.startOffset() - t2.startOffset();
 246         }
 247       });
 248     }
 249     return new StoredTokenStream(tokensInOriginalOrder);
 250   }
 251
 252   public static TokenStream getTokenStream(IndexReader reader, int docId,
 253       String field) throws IOException {
 254     TermFreqVector tfv = reader.getTermFreqVector(docId, field);
 255     if (tfv == null) {
 256       throw new IllegalArgumentException(field + " in doc #" + docId
 257           + "does not have any term position data stored");
 258     }
 259     if (tfv instanceof TermPositionVector) {
 260       TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
 261           docId, field);
 262       return getTokenStream(tpv);
 263     }
 264     throw new IllegalArgumentException(field + " in doc #" + docId
 265         + "does not have any term position data stored");
 266   }
 267
 268   // convenience method
 269   public static TokenStream getTokenStream(IndexReader reader, int docId,
 270       String field, Analyzer analyzer) throws IOException {
 271     Document doc = reader.document(docId);
 272     return getTokenStream(doc, field, analyzer);
 273   }
 274
 275   public static TokenStream getTokenStream(Document doc, String field,
 276       Analyzer analyzer) {
 277     String contents = doc.get(field);
 278     if (contents == null) {
 279       throw new IllegalArgumentException("Field " + field
 280           + " in document is not stored and cannot be analyzed");
 281     }
 282     return getTokenStream(field, contents, analyzer);
 283   }
 284
 285   // convenience method
 286   public static TokenStream getTokenStream(String field, String contents,
 287       Analyzer analyzer) {
 288     try {
 289       return analyzer.reusableTokenStream(field, new StringReader(contents));
 290     } catch (IOException ex) {
 291       throw new RuntimeException(ex);
 292     }
 293   }
 294
 295 }