lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.standard;
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.Tokenizer;
  24 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  28 import org.apache.lucene.util.AttributeSource;
  29 import org.apache.lucene.util.Version;
  30
  31 /** A grammar-based tokenizer constructed with JFlex
  32  *
  33  * <p> This should be a good tokenizer for most European-language documents:
  34  *
  35  * <ul>
  36  *   <li>Splits words at punctuation characters, removing punctuation. However, a
  37  *     dot that's not followed by whitespace is considered part of a token.
  38  *   <li>Splits words at hyphens, unless there's a number in the token, in which case
  39  *     the whole token is interpreted as a product number and is not split.
  40  *   <li>Recognizes email addresses and internet hostnames as one token.
  41  * </ul>
  42  *
  43  * <p>Many applications have specific tokenizer needs.  If this tokenizer does
  44  * not suit your application, please consider copying this source code
  45  * directory to your project and maintaining your own grammar-based tokenizer.
  46  *
  47  * <a name="version"/>
  48  * <p>You must specify the required {@link Version}
   49  * compatibility when creating ClassicTokenizer:
  50  * <ul>
  51  *   <li> As of 2.4, Tokens incorrectly identified as acronyms
   52  *        are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  53  * </ul>
  54  *
  55  * ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
  56  * As of 3.1, {@link StandardTokenizer} implements Unicode text segmentation,
  57  * as specified by UAX#29.
  58  */
  59
  60 public final class ClassicTokenizer extends Tokenizer {
  61   /** A private instance of the JFlex-constructed scanner */
  62   private StandardTokenizerInterface scanner;
  63
  64   public static final int ALPHANUM          = 0;
  65   public static final int APOSTROPHE        = 1;
  66   public static final int ACRONYM           = 2;
  67   public static final int COMPANY           = 3;
  68   public static final int EMAIL             = 4;
  69   public static final int HOST              = 5;
  70   public static final int NUM               = 6;
  71   public static final int CJ                = 7;
  72
  73   /**
  74    * @deprecated this solves a bug where HOSTs that end with '.' are identified
  75    *             as ACRONYMs.
  76    */
  77   @Deprecated
  78   public static final int ACRONYM_DEP       = 8;
  79
  80   /** String token types that correspond to token type int constants */
  81   public static final String [] TOKEN_TYPES = new String [] {
  82     "<ALPHANUM>",
  83     "<APOSTROPHE>",
  84     "<ACRONYM>",
  85     "<COMPANY>",
  86     "<EMAIL>",
  87     "<HOST>",
  88     "<NUM>",
  89     "<CJ>",
  90     "<ACRONYM_DEP>"
  91   };
  92
  93   private boolean replaceInvalidAcronym;
  94
  95   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
  96
  97   /** Set the max allowed token length.  Any token longer
  98    *  than this is skipped. */
  99   public void setMaxTokenLength(int length) {
 100     this.maxTokenLength = length;
 101   }
 102
 103   /** @see #setMaxTokenLength */
 104   public int getMaxTokenLength() {
 105     return maxTokenLength;
 106   }
 107
 108   /**
 109    * Creates a new instance of the {@link ClassicTokenizer}.  Attaches
 110    * the <code>input</code> to the newly created JFlex scanner.
 111    *
 112    * @param input The input reader
 113    *
 114    * See http://issues.apache.org/jira/browse/LUCENE-1068
 115    */
 116   public ClassicTokenizer(Version matchVersion, Reader input) {
 117     super();
 118     init(input, matchVersion);
 119   }
 120
 121   /**
 122    * Creates a new ClassicTokenizer with a given {@link AttributeSource}.
 123    */
 124   public ClassicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
 125     super(source);
 126     init(input, matchVersion);
 127   }
 128
 129   /**
 130    * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
 131    */
 132   public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
 133     super(factory);
 134     init(input, matchVersion);
 135   }
 136
 137   private final void init(Reader input, Version matchVersion) {
 138     this.scanner = new ClassicTokenizerImpl(input);
 139
 140     if (matchVersion.onOrAfter(Version.LUCENE_24)) {
 141       replaceInvalidAcronym = true;
 142     } else {
 143       replaceInvalidAcronym = false;
 144     }
 145     this.input = input;
 146   }
 147
 148   // this tokenizer generates three attributes:
 149   // term offset, positionIncrement and type
 150   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 151   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 152   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 153   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 154
 155   /*
 156    * (non-Javadoc)
 157    *
 158    * @see org.apache.lucene.analysis.TokenStream#next()
 159    */
 160   @Override
 161   public final boolean incrementToken() throws IOException {
 162     clearAttributes();
 163     int posIncr = 1;
 164
 165     while(true) {
 166       int tokenType = scanner.getNextToken();
 167
 168       if (tokenType == StandardTokenizerInterface.YYEOF) {
 169         return false;
 170       }
 171
 172       if (scanner.yylength() <= maxTokenLength) {
 173         posIncrAtt.setPositionIncrement(posIncr);
 174         scanner.getText(termAtt);
 175         final int start = scanner.yychar();
 176         offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
 177         // This 'if' should be removed in the next release. For now, it converts
 178         // invalid acronyms to HOST. When removed, only the 'else' part should
 179         // remain.
 180         if (tokenType == ClassicTokenizer.ACRONYM_DEP) {
 181           if (replaceInvalidAcronym) {
 182             typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]);
 183             termAtt.setLength(termAtt.length() - 1); // remove extra '.'
 184           } else {
 185             typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM]);
 186           }
 187         } else {
 188           typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[tokenType]);
 189         }
 190         return true;
 191       } else
 192         // When we skip a too-long term, we still increment the
 193         // position increment
 194         posIncr++;
 195     }
 196   }
 197
 198   @Override
 199   public final void end() {
 200     // set final offset
 201     int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
 202     offsetAtt.setOffset(finalOffset, finalOffset);
 203   }
 204
 205   @Override
 206   public void reset(Reader reader) throws IOException {
 207     super.reset(reader);
 208     scanner.yyreset(reader);
 209   }
 210
 211   /**
 212    * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, ClassicTokenizer mischaracterized as acronyms tokens like www.abc.com
 213    * when they should have been labeled as hosts instead.
 214    * @return true if ClassicTokenizer now returns these tokens as Hosts, otherwise false
 215    *
 216    * @deprecated Remove in 3.X and make true the only valid value
 217    */
 218   @Deprecated
 219   public boolean isReplaceInvalidAcronym() {
 220     return replaceInvalidAcronym;
 221   }
 222
 223   /**
 224    *
 225    * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
 226    * @deprecated Remove in 3.X and make true the only valid value
 227    *
 228    * See https://issues.apache.org/jira/browse/LUCENE-1068
 229    */
 230   @Deprecated
 231   public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
 232     this.replaceInvalidAcronym = replaceInvalidAcronym;
 233   }
 234 }