lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java

   1 package org.apache.lucene.analysis.br;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.io.Reader;
  23 import java.util.Collections;
  24 import java.util.HashSet;
  25 import java.util.Map;
  26 import java.util.Set;
  27
  28 import org.apache.lucene.analysis.Analyzer;
  29 import org.apache.lucene.analysis.CharArraySet;
  30 import org.apache.lucene.analysis.LowerCaseFilter;
  31 import org.apache.lucene.analysis.KeywordMarkerFilter;
  32 import org.apache.lucene.analysis.StopFilter;
  33 import org.apache.lucene.analysis.StopwordAnalyzerBase;
  34 import org.apache.lucene.analysis.TokenStream;
  35 import org.apache.lucene.analysis.Tokenizer;
  36 import org.apache.lucene.analysis.WordlistLoader;
  37 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  38 import org.apache.lucene.analysis.standard.StandardFilter;
  39 import org.apache.lucene.analysis.standard.StandardTokenizer;
  40 import org.apache.lucene.util.IOUtils;
  41 import org.apache.lucene.util.Version;
  42
  43 /**
  44  * {@link Analyzer} for Brazilian Portuguese language.
  45  * <p>
  46  * Supports an external list of stopwords (words that
  47  * will not be indexed at all) and an external list of exclusions (words that will
  48  * not be stemmed, but indexed).
  49  * </p>
  50  *
  51  * <p><b>NOTE</b>: This class uses the same {@link Version}
  52  * dependent settings as {@link StandardAnalyzer}.</p>
  53  */
  54 public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
  55
  56         /**
  57          * List of typical Brazilian Portuguese stopwords.
  58          * @deprecated use {@link #getDefaultStopSet()} instead
  59          */
  60         @Deprecated
  61         public final static String[] BRAZILIAN_STOP_WORDS = {
  62       "a","ainda","alem","ambas","ambos","antes",
  63       "ao","aonde","aos","apos","aquele","aqueles",
  64       "as","assim","com","como","contra","contudo",
  65       "cuja","cujas","cujo","cujos","da","das","de",
  66       "dela","dele","deles","demais","depois","desde",
  67       "desta","deste","dispoe","dispoem","diversa",
  68       "diversas","diversos","do","dos","durante","e",
  69       "ela","elas","ele","eles","em","entao","entre",
  70       "essa","essas","esse","esses","esta","estas",
  71       "este","estes","ha","isso","isto","logo","mais",
  72       "mas","mediante","menos","mesma","mesmas","mesmo",
  73       "mesmos","na","nas","nao","nas","nem","nesse","neste",
  74       "nos","o","os","ou","outra","outras","outro","outros",
  75       "pelas","pelas","pelo","pelos","perante","pois","por",
  76       "porque","portanto","proprio","propios","quais","qual",
  77       "qualquer","quando","quanto","que","quem","quer","se",
  78       "seja","sem","sendo","seu","seus","sob","sobre","sua",
  79       "suas","tal","tambem","teu","teus","toda","todas","todo",
  80       "todos","tua","tuas","tudo","um","uma","umas","uns"};
  81
  82   /** File containing default Brazilian Portuguese stopwords. */
  83   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  84
  85         /**
  86    * Returns an unmodifiable instance of the default stop-words set.
  87    * @return an unmodifiable instance of the default stop-words set.
  88    */
  89   public static Set<?> getDefaultStopSet(){
  90     return DefaultSetHolder.DEFAULT_STOP_SET;
  91   }
  92
  93   private static class DefaultSetHolder {
  94     static final Set<?> DEFAULT_STOP_SET;
  95
  96     static {
  97       try {
  98         DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
  99             DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
 100       } catch (IOException ex) {
 101         // default set should always be present as it is part of the
 102         // distribution (JAR)
 103         throw new RuntimeException("Unable to load default stopword set");
 104       }
 105     }
 106   }
 107
 108
 109         /**
 110          * Contains words that should be indexed but not stemmed.
 111          */
 112         // TODO make this private in 3.1
 113         private Set<?> excltable = Collections.emptySet();
 114
 115         /**
 116          * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
 117          */
 118         public BrazilianAnalyzer(Version matchVersion) {
 119     this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
 120         }
 121
 122         /**
 123    * Builds an analyzer with the given stop words
 124    *
 125    * @param matchVersion
 126    *          lucene compatibility version
 127    * @param stopwords
 128    *          a stopword set
 129    */
 130   public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
 131      super(matchVersion, stopwords);
 132   }
 133
 134   /**
 135    * Builds an analyzer with the given stop words and stemming exclusion words
 136    *
 137    * @param matchVersion
 138    *          lucene compatibility version
 139    * @param stopwords
 140    *          a stopword set
 141    */
 142   public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords,
 143       Set<?> stemExclusionSet) {
 144     this(matchVersion, stopwords);
 145     excltable = CharArraySet.unmodifiableSet(CharArraySet
 146         .copy(matchVersion, stemExclusionSet));
 147   }
 148
 149         /**
 150          * Builds an analyzer with the given stop words.
 151          * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 152          */
 153   @Deprecated
 154   public BrazilianAnalyzer(Version matchVersion, String... stopwords) {
 155     this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
 156   }
 157
 158   /**
 159    * Builds an analyzer with the given stop words.
 160    * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 161    */
 162   @Deprecated
 163   public BrazilianAnalyzer(Version matchVersion, Map<?,?> stopwords) {
 164     this(matchVersion, stopwords.keySet());
 165   }
 166
 167   /**
 168    * Builds an analyzer with the given stop words.
 169    * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
 170    */
 171   @Deprecated
 172   public BrazilianAnalyzer(Version matchVersion, File stopwords)
 173       throws IOException {
 174     this(matchVersion, WordlistLoader.getWordSet(
 175         IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
 176   }
 177
 178         /**
 179          * Builds an exclusionlist from an array of Strings.
 180          * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 181          */
 182         @Deprecated
 183         public void setStemExclusionTable( String... exclusionlist ) {
 184                 excltable = StopFilter.makeStopSet( matchVersion, exclusionlist );
 185                 setPreviousTokenStream(null); // force a new stemmer to be created
 186         }
 187         /**
 188          * Builds an exclusionlist from a {@link Map}.
 189          * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 190          */
 191         @Deprecated
 192         public void setStemExclusionTable( Map<?,?> exclusionlist ) {
 193                 excltable = new HashSet<Object>(exclusionlist.keySet());
 194                 setPreviousTokenStream(null); // force a new stemmer to be created
 195         }
 196         /**
 197          * Builds an exclusionlist from the words contained in the given file.
 198          * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
 199          */
 200         @Deprecated
 201         public void setStemExclusionTable( File exclusionlist ) throws IOException {
 202                 excltable = WordlistLoader.getWordSet(
 203                     IOUtils.getDecodingReader(exclusionlist, IOUtils.CHARSET_UTF_8), matchVersion);
 204                 setPreviousTokenStream(null); // force a new stemmer to be created
 205         }
 206
 207   /**
 208    * Creates
 209    * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 210    * used to tokenize all the text in the provided {@link Reader}.
 211    *
 212    * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 213    *         built from a {@link StandardTokenizer} filtered with
 214    *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}
 215    *         , and {@link BrazilianStemFilter}.
 216    */
 217   @Override
 218   protected TokenStreamComponents createComponents(String fieldName,
 219       Reader reader) {
 220     Tokenizer source = new StandardTokenizer(matchVersion, reader);
 221     TokenStream result = new LowerCaseFilter(matchVersion, source);
 222     result = new StandardFilter(matchVersion, result);
 223     result = new StopFilter(matchVersion, result, stopwords);
 224     if(excltable != null && !excltable.isEmpty())
 225       result = new KeywordMarkerFilter(result, excltable);
 226     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
 227   }
 228 }
 229