lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java

   1 package org.apache.lucene.analysis.cz;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.ReusableAnalyzerBase;
  21 import org.apache.lucene.analysis.Analyzer;
  22 import org.apache.lucene.analysis.CharArraySet;
  23 import org.apache.lucene.analysis.KeywordMarkerFilter;
  24 import org.apache.lucene.analysis.LowerCaseFilter;
  25 import org.apache.lucene.analysis.StopFilter;
  26 import org.apache.lucene.analysis.TokenStream;
  27 import org.apache.lucene.analysis.Tokenizer;
  28 import org.apache.lucene.analysis.WordlistLoader;
  29 import org.apache.lucene.analysis.standard.StandardFilter;
  30 import org.apache.lucene.analysis.standard.StandardTokenizer;
  31 import org.apache.lucene.util.IOUtils;
  32 import org.apache.lucene.util.Version;
  33
  34 import java.io.*;
  35 import java.nio.charset.Charset;
  36 import java.util.Collections;
  37 import java.util.HashSet;
  38 import java.util.Set;
  39
  40 /**
  41  * {@link Analyzer} for Czech language.
  42  * <p>
  43  * Supports an external list of stopwords (words that will not be indexed at
  44  * all). A default set of stopwords is used unless an alternative list is
  45  * specified.
  46  * </p>
  47  *
  48  * <a name="version"/>
  49  * <p>
  50  * You must specify the required {@link Version} compatibility when creating
  51  * CzechAnalyzer:
  52  * <ul>
  53  * <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
  54  * <li>As of 2.9, StopFilter preserves position increments
  55  * <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
  56  * <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
  57  * </ul>
  58  */
  59 public final class CzechAnalyzer extends ReusableAnalyzerBase {
  60
  61   /**
  62          * List of typical stopwords.
  63          * @deprecated use {@link #getDefaultStopSet()} instead
  64          */
  65         @Deprecated
  66         public final static String[] CZECH_STOP_WORDS = {
  67         "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
  68         "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
  69         "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed",
  70         "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla",
  71         "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm",
  72         "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto",
  73         "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma",
  74         "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem",
  75         "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich",
  76         "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1",
  77         "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9",
  78         "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce",
  79         "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak",
  80         "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve",
  81         "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp",
  82         "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy",
  83         "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
  84         "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
  85     };
  86
  87   /** File containing default Czech stopwords. */
  88   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  89
  90   /**
  91    * Returns a set of default Czech-stopwords
  92    *
  93    * @return a set of default Czech-stopwords
  94    */
  95         public static final Set<?> getDefaultStopSet(){
  96           return DefaultSetHolder.DEFAULT_SET;
  97         }
  98
  99         private static class DefaultSetHolder {
 100           private static final Set<?> DEFAULT_SET;
 101
 102           static {
 103             try {
 104               DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
 105                   DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
 106             } catch (IOException ex) {
 107               // default set should always be present as it is part of the
 108               // distribution (JAR)
 109               throw new RuntimeException("Unable to load default stopword set");
 110             }
 111           }
 112         }
 113
 114
 115   /**
 116    * Contains the stopwords used with the {@link StopFilter}.
 117    */
 118         // TODO once loadStopWords is gone those member should be removed too in favor of StopwordAnalyzerBase
 119         private Set<?> stoptable;
 120   private final Version matchVersion;
 121   private final Set<?> stemExclusionTable;
 122
 123   /**
 124    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
 125    *
 126    * @param matchVersion Lucene version to match See
 127    *          {@link <a href="#version">above</a>}
 128    */
 129         public CzechAnalyzer(Version matchVersion) {
 130     this(matchVersion, DefaultSetHolder.DEFAULT_SET);
 131         }
 132
 133   /**
 134    * Builds an analyzer with the given stop words.
 135    *
 136    * @param matchVersion Lucene version to match See
 137    *          {@link <a href="#version">above</a>}
 138    * @param stopwords a stopword set
 139    */
 140   public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
 141     this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
 142   }
 143
 144   /**
 145    * Builds an analyzer with the given stop words and a set of work to be
 146    * excluded from the {@link CzechStemFilter}.
 147    *
 148    * @param matchVersion Lucene version to match See
 149    *          {@link <a href="#version">above</a>}
 150    * @param stopwords a stopword set
 151    * @param stemExclusionTable a stemming exclusion set
 152    */
 153   public CzechAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable) {
 154     this.matchVersion = matchVersion;
 155     this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
 156     this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
 157   }
 158
 159
 160   /**
 161    * Builds an analyzer with the given stop words.
 162    *
 163    * @param matchVersion Lucene version to match See
 164    *          {@link <a href="#version">above</a>}
 165    * @param stopwords a stopword set
 166    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 167    */
 168   @Deprecated
 169   public CzechAnalyzer(Version matchVersion, String... stopwords) {
 170     this(matchVersion, StopFilter.makeStopSet( matchVersion, stopwords ));
 171         }
 172
 173   /**
 174    * Builds an analyzer with the given stop words.
 175    *
 176    * @param matchVersion Lucene version to match See
 177    *          {@link <a href="#version">above</a>}
 178    * @param stopwords a stopword set
 179    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 180    */
 181   @Deprecated
 182   public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
 183     this(matchVersion, (Set<?>)stopwords);
 184         }
 185
 186   /**
 187    * Builds an analyzer with the given stop words.
 188    *
 189    * @param matchVersion Lucene version to match See
 190    *          {@link <a href="#version">above</a>}
 191    * @param stopwords a file containing stopwords
 192    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
 193    */
 194   @Deprecated
 195   public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
 196     this(matchVersion, (Set<?>)WordlistLoader.getWordSet(
 197         IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8), matchVersion));
 198         }
 199
 200     /**
 201      * Loads stopwords hash from resource stream (file, database...).
 202      * @param   wordfile    File containing the wordlist
 203      * @param   encoding    Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
 204      * @deprecated use {@link WordlistLoader#getWordSet(Reader, String, Version) }
 205      *             and {@link #CzechAnalyzer(Version, Set)} instead
 206      */
 207     // TODO extend StopwordAnalyzerBase once this method is gone!
 208     @Deprecated
 209     public void loadStopWords( InputStream wordfile, String encoding ) {
 210         setPreviousTokenStream(null); // force a new stopfilter to be created
 211         if ( wordfile == null ) {
 212             stoptable = CharArraySet.EMPTY_SET;
 213             return;
 214         }
 215         try {
 216             // clear any previous table (if present)
 217             stoptable = CharArraySet.EMPTY_SET;
 218             stoptable = WordlistLoader.getWordSet(IOUtils.getDecodingReader(wordfile,
 219                 encoding == null ? IOUtils.CHARSET_UTF_8 : Charset.forName(encoding)), matchVersion);
 220         } catch ( IOException e ) {
 221           // clear any previous table (if present)
 222           // TODO: throw IOException
 223           stoptable = Collections.emptySet();
 224         }
 225     }
 226
 227   /**
 228    * Creates
 229    * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 230    * used to tokenize all the text in the provided {@link Reader}.
 231    *
 232    * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 233    *         built from a {@link StandardTokenizer} filtered with
 234    *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
 235    *         , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
 236    *         a version is >= LUCENE_31 and a stem exclusion set is provided via
 237    *         {@link #CzechAnalyzer(Version, Set, Set)} a
 238    *         {@link KeywordMarkerFilter} is added before
 239    *         {@link CzechStemFilter}.
 240    */
 241   @Override
 242   protected TokenStreamComponents createComponents(String fieldName,
 243       Reader reader) {
 244     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
 245     TokenStream result = new StandardFilter(matchVersion, source);
 246     result = new LowerCaseFilter(matchVersion, result);
 247     result = new StopFilter( matchVersion, result, stoptable);
 248     if (matchVersion.onOrAfter(Version.LUCENE_31)) {
 249       if(!this.stemExclusionTable.isEmpty())
 250         result = new KeywordMarkerFilter(result, stemExclusionTable);
 251       result = new CzechStemFilter(result);
 252     }
 253     return new TokenStreamComponents(source, result);
 254   }
 255 }
 256