lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/LetterTokenizer.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.Reader;
  21
  22 import org.apache.lucene.util.AttributeSource;
  23 import org.apache.lucene.util.Version;
  24
  25 /**
  26  * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
  27  * say, it defines tokens as maximal strings of adjacent letters, as defined by
  28  * java.lang.Character.isLetter() predicate.
  29  * <p>
  30  * Note: this does a decent job for most European languages, but does a terrible
  31  * job for some Asian languages, where words are not separated by spaces.
  32  * </p>
  33  * <p>
  34  * <a name="version"/>
  35  * You must specify the required {@link Version} compatibility when creating
  36  * {@link LetterTokenizer}:
  37  * <ul>
  38  * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
  39  * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
  40  * {@link CharTokenizer#normalize(int)} for details.</li>
  41  * </ul>
  42  * </p>
  43  */
  44
  45 public class LetterTokenizer extends CharTokenizer {
  46
  47   /**
  48    * Construct a new LetterTokenizer.
  49    *
  50    * @param matchVersion
  51    *          Lucene version to match See {@link <a href="#version">above</a>}
  52    * @param in
  53    *          the input to split up into tokens
  54    */
  55   public LetterTokenizer(Version matchVersion, Reader in) {
  56     super(matchVersion, in);
  57   }
  58
  59   /**
  60    * Construct a new LetterTokenizer using a given {@link AttributeSource}.
  61    *
  62    * @param matchVersion
  63    *          Lucene version to match See {@link <a href="#version">above</a>}
  64    * @param source
  65    *          the attribute source to use for this {@link Tokenizer}
  66    * @param in
  67    *          the input to split up into tokens
  68    */
  69   public LetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
  70     super(matchVersion, source, in);
  71   }
  72
  73   /**
  74    * Construct a new LetterTokenizer using a given
  75    * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
  76    *
  77    * @param matchVersion
  78    *          Lucene version to match See {@link <a href="#version">above</a>}
  79    * @param factory
  80    *          the attribute factory to use for this {@link Tokenizer}
  81    * @param in
  82    *          the input to split up into tokens
  83    */
  84   public LetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
  85     super(matchVersion, factory, in);
  86   }
  87
  88   /**
  89    * Construct a new LetterTokenizer.
  90    *
  91    * @deprecated use {@link #LetterTokenizer(Version, Reader)} instead. This
  92    *             will be removed in Lucene 4.0.
  93    */
  94   @Deprecated
  95   public LetterTokenizer(Reader in) {
  96     super(Version.LUCENE_30, in);
  97   }
  98
  99   /**
 100    * Construct a new LetterTokenizer using a given {@link AttributeSource}.
 101    * @deprecated
 102    * use {@link #LetterTokenizer(Version, AttributeSource, Reader)} instead.
 103    * This will be removed in Lucene 4.0.
 104    */
 105   @Deprecated
 106   public LetterTokenizer(AttributeSource source, Reader in) {
 107     super(Version.LUCENE_30, source, in);
 108   }
 109
 110   /**
 111    * Construct a new LetterTokenizer using a given
 112    * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
 113    *
 114    * @deprecated use {@link #LetterTokenizer(Version, AttributeSource.AttributeFactory, Reader)}
 115    *             instead. This will be removed in Lucene 4.0.
 116    */
 117   @Deprecated
 118   public LetterTokenizer(AttributeFactory factory, Reader in) {
 119     super(Version.LUCENE_30, factory, in);
 120   }
 121
 122   /** Collects only characters which satisfy
 123    * {@link Character#isLetter(int)}.*/
 124   @Override
 125   protected boolean isTokenChar(int c) {
 126     return Character.isLetter(c);
 127   }
 128 }