lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis;
  19
  20 import java.io.IOException;
  21 import java.util.Set;
  22
  23 import org.apache.lucene.analysis.CharArraySet;
  24 import org.apache.lucene.analysis.ReusableAnalyzerBase;
  25 import org.apache.lucene.analysis.WordlistLoader;
  26 import org.apache.lucene.util.Version;
  27
  28 /**
  29  * Base class for Analyzers that need to make use of stopword sets.
  30  *
  31  */
  32 public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase {
  33
  34   /**
  35    * An immutable stopword set
  36    */
  37   protected final CharArraySet stopwords;
  38
  39   protected final Version matchVersion;
  40
  41   /**
  42    * Returns the analyzer's stopword set or an empty set if the analyzer has no
  43    * stopwords
  44    *
  45    * @return the analyzer's stopword set or an empty set if the analyzer has no
  46    *         stopwords
  47    */
  48   public Set<?> getStopwordSet() {
  49     return stopwords;
  50   }
  51
  52   /**
  53    * Creates a new instance initialized with the given stopword set
  54    *
  55    * @param version
  56    *          the Lucene version for cross version compatibility
  57    * @param stopwords
  58    *          the analyzer's stopword set
  59    */
  60   protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
  61     matchVersion = version;
  62     // analyzers should use char array set for stopwords!
  63     this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
  64         .unmodifiableSet(CharArraySet.copy(version, stopwords));
  65   }
  66
  67   /**
  68    * Creates a new Analyzer with an empty stopword set
  69    *
  70    * @param version
  71    *          the Lucene version for cross version compatibility
  72    */
  73   protected StopwordAnalyzerBase(final Version version) {
  74     this(version, null);
  75   }
  76
  77   /**
  78    * Creates a CharArraySet from a file resource associated with a class. (See
  79    * {@link Class#getResourceAsStream(String)}).
  80    *
  81    * @param ignoreCase
  82    *          <code>true</code> if the set should ignore the case of the
  83    *          stopwords, otherwise <code>false</code>
  84    * @param aClass
  85    *          a class that is associated with the given stopwordResource
  86    * @param resource
  87    *          name of the resource file associated with the given class
  88    * @param comment
  89    *          comment string to ignore in the stopword file
  90    * @return a CharArraySet containing the distinct stopwords from the given
  91    *         file
  92    * @throws IOException
  93    *           if loading the stopwords throws an {@link IOException}
  94    */
  95   protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
  96       final Class<? extends ReusableAnalyzerBase> aClass, final String resource,
  97       final String comment) throws IOException {
  98     final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
  99         comment);
 100     final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
 101     set.addAll(wordSet);
 102     return set;
 103   }
 104
 105 }