lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java

   1 package org.apache.lucene.analysis.cjk;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.Analyzer;
  21 import org.apache.lucene.analysis.CharArraySet;
  22 import org.apache.lucene.analysis.StopFilter;
  23 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  24 import org.apache.lucene.analysis.Tokenizer;
  25 import org.apache.lucene.util.Version;
  26
  27 import java.io.Reader;
  28 import java.util.Arrays;
  29 import java.util.Set;
  30
  31
  32 /**
  33  * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and
  34  * filters with {@link StopFilter}
  35  *
  36  */
  37 public final class CJKAnalyzer extends StopwordAnalyzerBase {
  38   //~ Static fields/initializers ---------------------------------------------
  39
  40   /**
  41    * An array containing some common English words that are not usually
  42    * useful for searching and some double-byte interpunctions.
  43    * @deprecated use {@link #getDefaultStopSet()} instead
  44    */
  45   @Deprecated
  46   public final static String[] STOP_WORDS = {
  47     "a", "and", "are", "as", "at", "be",
  48     "but", "by", "for", "if", "in",
  49     "into", "is", "it", "no", "not",
  50     "of", "on", "or", "s", "such", "t",
  51     "that", "the", "their", "then",
  52     "there", "these", "they", "this",
  53     "to", "was", "will", "with", "",
  54     "www"
  55   };
  56
  57   //~ Instance fields --------------------------------------------------------
  58
  59   /**
  60    * Returns an unmodifiable instance of the default stop-words set.
  61    * @return an unmodifiable instance of the default stop-words set.
  62    */
  63   public static Set<?> getDefaultStopSet(){
  64     return DefaultSetHolder.DEFAULT_STOP_SET;
  65   }
  66
  67   private static class DefaultSetHolder {
  68     static final Set<?> DEFAULT_STOP_SET = CharArraySet
  69         .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
  70             false));
  71   }
  72
  73   //~ Constructors -----------------------------------------------------------
  74
  75   /**
  76    * Builds an analyzer which removes words in {@link #getDefaultStopSet()}.
  77    */
  78   public CJKAnalyzer(Version matchVersion) {
  79     this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  80   }
  81
  82   /**
  83    * Builds an analyzer with the given stop words
  84    *
  85    * @param matchVersion
  86    *          lucene compatibility version
  87    * @param stopwords
  88    *          a stopword set
  89    */
  90   public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
  91     super(matchVersion, stopwords);
  92   }
  93
  94   /**
  95    * Builds an analyzer which removes words in the provided array.
  96    *
  97    * @param stopWords stop word array
  98    * @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
  99    */
 100   @Deprecated
 101   public CJKAnalyzer(Version matchVersion, String... stopWords) {
 102     super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
 103   }
 104
 105   //~ Methods ----------------------------------------------------------------
 106
 107   @Override
 108   protected TokenStreamComponents createComponents(String fieldName,
 109       Reader reader) {
 110     final Tokenizer source = new CJKTokenizer(reader);
 111     return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
 112   }
 113 }