lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/StandardFilter.java

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.analysis.TokenFilter;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  26 import org.apache.lucene.util.Version;
  27
  28 /**
  29  * Normalizes tokens extracted with {@link StandardTokenizer}.
  30  */
  31 public class StandardFilter extends TokenFilter {
  32   private final Version matchVersion;
  33
  34   /** @deprecated Use {@link #StandardFilter(Version, TokenStream)} instead. */
  35   @Deprecated
  36   public StandardFilter(TokenStream in) {
  37     this(Version.LUCENE_30, in);
  38   }
  39
  40   public StandardFilter(Version matchVersion, TokenStream in) {
  41     super(in);
  42     this.matchVersion = matchVersion;
  43   }
  44
  45   private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
  46   private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
  47
  48   // this filters uses attribute type
  49   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  50   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  51
  52   @Override
  53   public final boolean incrementToken() throws IOException {
  54     if (matchVersion.onOrAfter(Version.LUCENE_31))
  55       return input.incrementToken(); // TODO: add some niceties for the new grammar
  56     else
  57       return incrementTokenClassic();
  58   }
  59
  60   public final boolean incrementTokenClassic() throws IOException {
  61     if (!input.incrementToken()) {
  62       return false;
  63     }
  64
  65     final char[] buffer = termAtt.buffer();
  66     final int bufferLength = termAtt.length();
  67     final String type = typeAtt.type();
  68
  69     if (type == APOSTROPHE_TYPE &&      // remove 's
  70         bufferLength >= 2 &&
  71         buffer[bufferLength-2] == '\'' &&
  72         (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
  73       // Strip last 2 characters off
  74       termAtt.setLength(bufferLength - 2);
  75     } else if (type == ACRONYM_TYPE) {      // remove dots
  76       int upto = 0;
  77       for(int i=0;i<bufferLength;i++) {
  78         char c = buffer[i];
  79         if (c != '.')
  80           buffer[upto++] = c;
  81       }
  82       termAtt.setLength(upto);
  83     }
  84
  85     return true;
  86   }
  87 }