lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java

   1 package org.apache.lucene.analysis.cz;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.ReusableAnalyzerBase;
  21 import org.apache.lucene.analysis.Analyzer;
  22 import org.apache.lucene.analysis.CharArraySet;
  23 import org.apache.lucene.analysis.KeywordMarkerFilter;
  24 import org.apache.lucene.analysis.LowerCaseFilter;
  25 import org.apache.lucene.analysis.StopFilter;
  26 import org.apache.lucene.analysis.TokenStream;
  27 import org.apache.lucene.analysis.Tokenizer;
  28 import org.apache.lucene.analysis.WordlistLoader;
  29 import org.apache.lucene.analysis.standard.StandardFilter;
  30 import org.apache.lucene.analysis.standard.StandardTokenizer;
  31 import org.apache.lucene.util.Version;
  32
  33 import java.io.*;
  34 import java.util.Collections;
  35 import java.util.HashSet;
  36 import java.util.Set;
  37
  38 /**
  39  * {@link Analyzer} for Czech language.
  40  * <p>
  41  * Supports an external list of stopwords (words that will not be indexed at
  42  * all). A default set of stopwords is used unless an alternative list is
  43  * specified.
  44  * </p>
  45  *
  46  * <a name="version"/>
  47  * <p>
  48  * You must specify the required {@link Version} compatibility when creating
  49  * CzechAnalyzer:
  50  * <ul>
  51  * <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
  52  * <li>As of 2.9, StopFilter preserves position increments
  53  * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
  54  * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  55  * </ul>
  56  */
  57 public final class CzechAnalyzer extends ReusableAnalyzerBase {
  58
  59   /**
  60          * List of typical stopwords.
  61          * @deprecated use {@link #getDefaultStopSet()} instead
  62          */
  63         @Deprecated
  64         public final static String[] CZECH_STOP_WORDS = {
  65         "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
  66         "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
  67         "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
  68         "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
  69         "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
  70         "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
  71         "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
  72         "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
  73         "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
  74         "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
  75         "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
  76         "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
  77         "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
  78         "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
  79         "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
  80         "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
  81         "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
  82         "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
  83     };
  84
  85   /** File containing default Czech stopwords. */
  86   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  87
  88   /**
  89    * Returns a set of default Czech-stopwords
  90    *
  91    * @return a set of default Czech-stopwords
  92    */
  93         public static final Set<?> getDefaultStopSet(){
  94           return DefaultSetHolder.DEFAULT_SET;
  95         }
  96
  97         private static class DefaultSetHolder {
  98           private static final Set<?> DEFAULT_SET;
  99
 100           static {
 101             try {
 102               DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
 103                   Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class,
 104                       DEFAULT_STOPWORD_FILE, "#"), false));
 105             } catch (IOException ex) {
 106               // default set should always be present as it is part of the
 107               // distribution (JAR)
 108               throw new RuntimeException("Unable to load default stopword set");
 109             }
 110           }
 111         }
 112
 113
 114   /**
 115    * Contains the stopwords used with the {@link StopFilter}.
 116    */
 117         // TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
 118         private Set<?> stoptable;
 119   private final Version matchVersion;
 120   private final Set<?> stemExclusionTable;
 121
 122   /**
 123    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
 124    *
 125    * @param matchVersion Lucene version to match See
 126    *          {@link <a href="#version">above</a>}
 127    */
 128         public CzechAnalyzer(Version matchVersion) {
 129     this(matchVersion, DefaultSetHolder.DEFAULT_SET);
 130         }
 131
 132   /**
 133    * Builds an analyzer with the given stop words.
 134    *
 135    * @param matchVersion Lucene version to match See
 136    *          {@link <a href="#version">above</a>}
 137    * @param stopwords a stopword set
 138    */
 139   public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
 140     this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
 141   }
 142
 143   /**
 144    * Builds an analyzer with the given stop words and a set of work to be
 145    * excluded from the {@link CzechStemFilter}.
 146    *
 147    * @param matchVersion Lucene version to match See
 148    *          {@link <a href="#version">above</a>}
 149    * @param stopwords a stopword set
 150    * @param stemExclusionTable a stemming exclusion set
 151    */
 152   public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
 153     this.matchVersion = matchVersion;
 154     this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
 155     this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
 156   }
 157
 158
 159   /**
 160    * Builds an analyzer with the given stop words.
 161    *
 162    * @param matchVersion Lucene version to match See
 163    *          {@link <a href="#version">above</a>}
 164    * @param stopwords a stopword set
 165    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 166    */
 167   @Deprecated
 168   public CzechAnalyzer(Version matchVersion, String... stopwords) {
 169     this(matchVersion, StopFilter.makeStopSet( matchVersion, stopwords ));
 170         }
 171
 172   /**
 173    * Builds an analyzer with the given stop words.
 174    *
 175    * @param matchVersion Lucene version to match See
 176    *          {@link <a href="#version">above</a>}
 177    * @param stopwords a stopword set
 178    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 179    */
 180   @Deprecated
 181   public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
 182     this(matchVersion, (Set<?>)stopwords);
 183         }
 184
 185   /**
 186    * Builds an analyzer with the given stop words.
 187    *
 188    * @param matchVersion Lucene version to match See
 189    *          {@link <a href="#version">above</a>}
 190    * @param stopwords a file containing stopwords
 191    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 192    */
 193   @Deprecated
 194   public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
 195     this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
 196         }
 197
 198     /**
 199      * Loads stopwords hash from resource stream (file, database...).
 200      * @param   wordfile    File containing the wordlist
 201      * @param   encoding    Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
 202      * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
 203      *             and {@link #CzechAnalyzer(Version, Set)} instead
 204      */
 205     // TODO extend StopwordAnalyzerBase once this method is gone!
 206     @Deprecated
 207     public void loadStopWords( InputStream wordfile, String encoding ) {
 208         setPreviousTokenStream(null); // force a new stopfilter to be created
 209         if ( wordfile == null ) {
 210             stoptable = Collections.emptySet();
 211             return;
 212         }
 213         try {
 214             // clear any previous table (if present)
 215             stoptable = Collections.emptySet();
 216
 217             InputStreamReader isr;
 218             if (encoding == null)
 219                 isr = new InputStreamReader(wordfile);
 220             else
 221                 isr = new InputStreamReader(wordfile, encoding);
 222
 223             stoptable = WordlistLoader.getWordSet(isr);
 224         } catch ( IOException e ) {
 225           // clear any previous table (if present)
 226           // TODO: throw IOException
 227           stoptable = Collections.emptySet();
 228         }
 229     }
 230
 231   /**
 232    * Creates
 233    * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 234    * used to tokenize all the text in the provided {@link Reader}.
 235    *
 236    * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 237    *         built from a {@link StandardTokenizer} filtered with
 238    *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
 239    *         , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
 240    *         a version is >= LUCENE_31 and a stem exclusion set is provided via
 241    *         {@link #CzechAnalyzer(Version, Set, Set)} a
 242    *         {@link KeywordMarkerFilter} is added before
 243    *         {@link CzechStemFilter}.
 244    */
 245   @Override
 246   protected TokenStreamComponents createComponents(String fieldName,
 247       Reader reader) {
 248     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
 249     TokenStream result = new StandardFilter(matchVersion, source);
 250     result = new LowerCaseFilter(matchVersion, result);
 251     result = new StopFilter( matchVersion, result, stoptable);
 252     if (matchVersion.onOrAfter(Version.LUCENE_31)) {
 253       if(!this.stemExclusionTable.isEmpty())
 254         result = new KeywordMarkerFilter(result, stemExclusionTable);
 255       result = new CzechStemFilter(result);
 256     }
 257     return new TokenStreamComponents(source, result);
 258   }
 259 }
 260