lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java

   1 package org.apache.lucene.analysis.br;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.io.Reader;
  23 import java.util.Collections;
  24 import java.util.HashSet;
  25 import java.util.Map;
  26 import java.util.Set;
  27
  28 import org.apache.lucene.analysis.Analyzer;
  29 import org.apache.lucene.analysis.CharArraySet;
  30 import org.apache.lucene.analysis.LowerCaseFilter;
  31 import org.apache.lucene.analysis.KeywordMarkerFilter;
  32 import org.apache.lucene.analysis.StopFilter;
  33 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  34 import org.apache.lucene.analysis.TokenStream;
  35 import org.apache.lucene.analysis.Tokenizer;
  36 import org.apache.lucene.analysis.WordlistLoader;
  37 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  38 import org.apache.lucene.analysis.standard.StandardFilter;
  39 import org.apache.lucene.analysis.standard.StandardTokenizer;
  40 import org.apache.lucene.util.Version;
  41
  42 /**
  43  * {@link Analyzer} for Brazilian Portuguese language.
  44  * <p>
  45  * Supports an external list of stopwords (words that
  46  * will not be indexed at all) and an external list of exclusions (words that will
  47  * not be stemmed, but indexed).
  48  * </p>
  49  *
  50  * <p><b>NOTE</b>: This class uses the same {@link Version}
  51  * dependent settings as {@link StandardAnalyzer}.</p>
  52  */
  53 public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
  54
  55         /**
  56          * List of typical Brazilian Portuguese stopwords.
  57          * @deprecated use {@link #getDefaultStopSet()} instead
  58          */
  59         @Deprecated
  60         public final static String[] BRAZILIAN_STOP_WORDS = {
  61       "a","ainda","alem","ambas","ambos","antes",
  62       "ao","aonde","aos","apos","aquele","aqueles",
  63       "as","assim","com","como","contra","contudo",
  64       "cuja","cujas","cujo","cujos","da","das","de",
  65       "dela","dele","deles","demais","depois","desde",
  66       "desta","deste","dispoe","dispoem","diversa",
  67       "diversas","diversos","do","dos","durante","e",
  68       "ela","elas","ele","eles","em","entao","entre",
  69       "essa","essas","esse","esses","esta","estas",
  70       "este","estes","ha","isso","isto","logo","mais",
  71       "mas","mediante","menos","mesma","mesmas","mesmo",
  72       "mesmos","na","nas","nao","nas","nem","nesse","neste",
  73       "nos","o","os","ou","outra","outras","outro","outros",
  74       "pelas","pelas","pelo","pelos","perante","pois","por",
  75       "porque","portanto","proprio","propios","quais","qual",
  76       "qualquer","quando","quanto","que","quem","quer","se",
  77       "seja","sem","sendo","seu","seus","sob","sobre","sua",
  78       "suas","tal","tambem","teu","teus","toda","todas","todo",
  79       "todos","tua","tuas","tudo","um","uma","umas","uns"};
  80
  81   /** File containing default Brazilian Portuguese stopwords. */
  82   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  83
  84         /**
  85    * Returns an unmodifiable instance of the default stop-words set.
  86    * @return an unmodifiable instance of the default stop-words set.
  87    */
  88   public static Set<?> getDefaultStopSet(){
  89     return DefaultSetHolder.DEFAULT_STOP_SET;
  90   }
  91
  92   private static class DefaultSetHolder {
  93     static final Set<?> DEFAULT_STOP_SET;
  94
  95     static {
  96       try {
  97         DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet(
  98             Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class,
  99                 DEFAULT_STOPWORD_FILE, "#"), false));
 100       } catch (IOException ex) {
 101         // default set should always be present as it is part of the
 102         // distribution (JAR)
 103         throw new RuntimeException("Unable to load default stopword set");
 104       }
 105     }
 106   }
 107
 108
 109         /**
 110          * Contains words that should be indexed but not stemmed.
 111          */
 112         // TODO make this private in 3.1
 113         private Set<?> excltable = Collections.emptySet();
 114
 115         /**
 116          * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
 117          */
 118         public BrazilianAnalyzer(Version matchVersion) {
 119     this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
 120         }
 121
 122         /**
 123    * Builds an analyzer with the given stop words
 124    *
 125    * @param matchVersion
 126    *          lucene compatibility version
 127    * @param stopwords
 128    *          a stopword set
 129    */
 130   public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
 131      super(matchVersion, stopwords);
 132   }
 133
 134   /**
 135    * Builds an analyzer with the given stop words and stemming exclusion words
 136    *
 137    * @param matchVersion
 138    *          lucene compatibility version
 139    * @param stopwords
 140    *          a stopword set
 141    */
 142   public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords,
 143       Set<?> stemExclusionSet) {
 144     this(matchVersion, stopwords);
 145     excltable = CharArraySet.unmodifiableSet(CharArraySet
 146         .copy(matchVersion, stemExclusionSet));
 147   }
 148
 149         /**
 150          * Builds an analyzer with the given stop words.
 151          * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 152          */
 153   @Deprecated
 154   public BrazilianAnalyzer(Version matchVersion, String... stopwords) {
 155     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
 156   }
 157
 158   /**
 159    * Builds an analyzer with the given stop words.
 160    * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 161    */
 162   @Deprecated
 163   public BrazilianAnalyzer(Version matchVersion, Map<?,?> stopwords) {
 164     this(matchVersion, stopwords.keySet());
 165   }
 166
 167   /**
 168    * Builds an analyzer with the given stop words.
 169    * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 170    */
 171   @Deprecated
 172   public BrazilianAnalyzer(Version matchVersion, File stopwords)
 173       throws IOException {
 174     this(matchVersion, WordlistLoader.getWordSet(stopwords));
 175   }
 176
 177         /**
 178          * Builds an exclusionlist from an array of Strings.
 179          * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 180          */
 181         @Deprecated
 182         public void setStemExclusionTable( String... exclusionlist ) {
 183                 excltable = StopFilter.makeStopSet( matchVersion, exclusionlist );
 184                 setPreviousTokenStream(null); // force a new stemmer to be created
 185         }
 186         /**
 187          * Builds an exclusionlist from a {@link Map}.
 188          * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 189          */
 190         @Deprecated
 191         public void setStemExclusionTable( Map<?,?> exclusionlist ) {
 192                 excltable = new HashSet<Object>(exclusionlist.keySet());
 193                 setPreviousTokenStream(null); // force a new stemmer to be created
 194         }
 195         /**
 196          * Builds an exclusionlist from the words contained in the given file.
 197          * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 198          */
 199         @Deprecated
 200         public void setStemExclusionTable( File exclusionlist ) throws IOException {
 201                 excltable = WordlistLoader.getWordSet( exclusionlist );
 202                 setPreviousTokenStream(null); // force a new stemmer to be created
 203         }
 204
 205   /**
 206    * Creates
 207    * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 208    * used to tokenize all the text in the provided {@link Reader}.
 209    *
 210    * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 211    *         built from a {@link StandardTokenizer} filtered with
 212    *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}
 213    *         , and {@link BrazilianStemFilter}.
 214    */
 215   @Override
 216   protected TokenStreamComponents createComponents(String fieldName,
 217       Reader reader) {
 218     Tokenizer source = new StandardTokenizer(matchVersion, reader);
 219     TokenStream result = new LowerCaseFilter(matchVersion, source);
 220     result = new StandardFilter(matchVersion, result);
 221     result = new StopFilter(matchVersion, result, stopwords);
 222     if(excltable != null && !excltable.isEmpty())
 223       result = new KeywordMarkerFilter(result, excltable);
 224     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
 225   }
 226 }
 227