lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.standard;
  19
  20 import org.apache.lucene.analysis.Tokenizer;
  21 import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
  22 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  23 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  24 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  26 import org.apache.lucene.util.AttributeSource;
  27 import org.apache.lucene.util.Version;
  28
  29 import java.io.IOException;
  30 import java.io.Reader;
  31
  32 /** A grammar-based tokenizer constructed with JFlex.
  33  * <p>
  34  * As of Lucene version 3.1, this class implements the Word Break rules from the
  35  * Unicode Text Segmentation algorithm, as specified in
  36  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
  37  * <p/>
  38  * <p>Many applications have specific tokenizer needs.  If this tokenizer does
  39  * not suit your application, please consider copying this source code
  40  * directory to your project and maintaining your own grammar-based tokenizer.
  41  *
  42  * <a name="version"/>
  43  * <p>You must specify the required {@link Version}
  44  * compatibility when creating StandardTokenizer:
  45  * <ul>
  46  *   <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
  47  *   from their combining characters. If you use a previous version number,
  48  *   you get the exact broken behavior for backwards compatibility.
  49  *   <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
  50  *   If you use a previous version number, you get the exact behavior of
  51  *   {@link ClassicTokenizer} for backwards compatibility.
  52  * </ul>
  53  */
  54
  55 public final class StandardTokenizer extends Tokenizer {
  56   /** A private instance of the JFlex-constructed scanner */
  57   private StandardTokenizerInterface scanner;
  58
  59   public static final int ALPHANUM          = 0;
  60   /** @deprecated */
  61   @Deprecated
  62   public static final int APOSTROPHE        = 1;
  63   /** @deprecated */
  64   @Deprecated
  65   public static final int ACRONYM           = 2;
  66   /** @deprecated */
  67   @Deprecated
  68   public static final int COMPANY           = 3;
  69   public static final int EMAIL             = 4;
  70   /** @deprecated */
  71   @Deprecated
  72   public static final int HOST              = 5;
  73   public static final int NUM               = 6;
  74   /** @deprecated */
  75   @Deprecated
  76   public static final int CJ                = 7;
  77
  78   /**
  79    * @deprecated this solves a bug where HOSTs that end with '.' are identified
  80    *             as ACRONYMs.
  81    */
  82   @Deprecated
  83   public static final int ACRONYM_DEP       = 8;
  84
  85   public static final int SOUTHEAST_ASIAN = 9;
  86   public static final int IDEOGRAPHIC = 10;
  87   public static final int HIRAGANA = 11;
  88   public static final int KATAKANA = 12;
  89   public static final int HANGUL = 13;
  90
  91   /** String token types that correspond to token type int constants */
  92   public static final String [] TOKEN_TYPES = new String [] {
  93     "<ALPHANUM>",
  94     "<APOSTROPHE>",
  95     "<ACRONYM>",
  96     "<COMPANY>",
  97     "<EMAIL>",
  98     "<HOST>",
  99     "<NUM>",
 100     "<CJ>",
 101     "<ACRONYM_DEP>",
 102     "<SOUTHEAST_ASIAN>",
 103     "<IDEOGRAPHIC>",
 104     "<HIRAGANA>",
 105     "<KATAKANA>",
 106     "<HANGUL>"
 107   };
 108
 109   private boolean replaceInvalidAcronym;
 110
 111   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 112
 113   /** Set the max allowed token length.  Any token longer
 114    *  than this is skipped. */
 115   public void setMaxTokenLength(int length) {
 116     this.maxTokenLength = length;
 117   }
 118
 119   /** @see #setMaxTokenLength */
 120   public int getMaxTokenLength() {
 121     return maxTokenLength;
 122   }
 123
 124   /**
 125    * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}.  Attaches
 126    * the <code>input</code> to the newly created JFlex scanner.
 127    *
 128    * @param input The input reader
 129    *
 130    * See http://issues.apache.org/jira/browse/LUCENE-1068
 131    */
 132   public StandardTokenizer(Version matchVersion, Reader input) {
 133     super();
 134     init(input, matchVersion);
 135   }
 136
 137   /**
 138    * Creates a new StandardTokenizer with a given {@link AttributeSource}.
 139    */
 140   public StandardTokenizer(Version matchVersion, AttributeSource source, Reader input) {
 141     super(source);
 142     init(input, matchVersion);
 143   }
 144
 145   /**
 146    * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
 147    */
 148   public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
 149     super(factory);
 150     init(input, matchVersion);
 151   }
 152
 153   private final void init(Reader input, Version matchVersion) {
 154     if (matchVersion.onOrAfter(Version.LUCENE_34)) {
 155       this.scanner = new StandardTokenizerImpl(input);
 156     } else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
 157       this.scanner = new StandardTokenizerImpl31(input);
 158     } else {
 159       this.scanner = new ClassicTokenizerImpl(input);
 160     }
 161     if (matchVersion.onOrAfter(Version.LUCENE_24)) {
 162       replaceInvalidAcronym = true;
 163     } else {
 164       replaceInvalidAcronym = false;
 165     }
 166     this.input = input;
 167   }
 168
 169   // this tokenizer generates three attributes:
 170   // term offset, positionIncrement and type
 171   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 172   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 173   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 174   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 175
 176   /*
 177    * (non-Javadoc)
 178    *
 179    * @see org.apache.lucene.analysis.TokenStream#next()
 180    */
 181   @Override
 182   public final boolean incrementToken() throws IOException {
 183     clearAttributes();
 184     int posIncr = 1;
 185
 186     while(true) {
 187       int tokenType = scanner.getNextToken();
 188
 189       if (tokenType == StandardTokenizerInterface.YYEOF) {
 190         return false;
 191       }
 192
 193       if (scanner.yylength() <= maxTokenLength) {
 194         posIncrAtt.setPositionIncrement(posIncr);
 195         scanner.getText(termAtt);
 196         final int start = scanner.yychar();
 197         offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
 198         // This 'if' should be removed in the next release. For now, it converts
 199         // invalid acronyms to HOST. When removed, only the 'else' part should
 200         // remain.
 201         if (tokenType == StandardTokenizer.ACRONYM_DEP) {
 202           if (replaceInvalidAcronym) {
 203             typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
 204             termAtt.setLength(termAtt.length() - 1); // remove extra '.'
 205           } else {
 206             typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ACRONYM]);
 207           }
 208         } else {
 209           typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
 210         }
 211         return true;
 212       } else
 213         // When we skip a too-long term, we still increment the
 214         // position increment
 215         posIncr++;
 216     }
 217   }
 218
 219   @Override
 220   public final void end() {
 221     // set final offset
 222     int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
 223     offsetAtt.setOffset(finalOffset, finalOffset);
 224   }
 225
 226   @Override
 227   public void reset(Reader reader) throws IOException {
 228     super.reset(reader);
 229     scanner.yyreset(reader);
 230   }
 231
 232   /**
 233    * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
 234    * when they should have been labeled as hosts instead.
 235    * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
 236    *
 237    * @deprecated Remove in 3.X and make true the only valid value
 238    */
 239   @Deprecated
 240   public boolean isReplaceInvalidAcronym() {
 241     return replaceInvalidAcronym;
 242   }
 243
 244   /**
 245    *
 246    * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
 247    * @deprecated Remove in 3.X and make true the only valid value
 248    *
 249    * See https://issues.apache.org/jira/browse/LUCENE-1068
 250    */
 251   @Deprecated
 252   public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
 253     this.replaceInvalidAcronym = replaceInvalidAcronym;
 254   }
 255 }