lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/WordlistLoader.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.BufferedReader;
  21 import java.io.IOException;
  22 import java.io.Reader;
  23
  24 import org.apache.lucene.util.IOUtils;
  25 import org.apache.lucene.util.Version;
  26
  27 /**
  28  * Loader for text files that represent a list of stopwords.
  29  *
  30  * @see IOUtils to obtain {@link Reader} instances
  31  * @lucene.internal
  32  */
  33 public class WordlistLoader {
  34
  35   private static final int INITITAL_CAPACITY = 16;
  36
  37   /**
  38    * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
  39    * leading and trailing whitespace). Every line of the Reader should contain only
  40    * one word. The words need to be in lowercase if you make use of an
  41    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
  42    *
  43    * @param reader Reader containing the wordlist
  44    * @param result the {@link CharArraySet} to fill with the readers words
  45    * @return the given {@link CharArraySet} with the reader's words
  46    */
  47   public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
  48     BufferedReader br = null;
  49     try {
  50       br = getBufferedReader(reader);
  51       String word = null;
  52       while ((word = br.readLine()) != null) {
  53         result.add(word.trim());
  54       }
  55     }
  56     finally {
  57       IOUtils.close(br);
  58     }
  59     return result;
  60   }
  61
  62   /**
  63    * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
  64    * leading and trailing whitespace). Every line of the Reader should contain only
  65    * one word. The words need to be in lowercase if you make use of an
  66    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
  67    *
  68    * @param reader Reader containing the wordlist
  69    * @param matchVersion the Lucene {@link Version}
  70    * @return A {@link CharArraySet} with the reader's words
  71    */
  72   public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
  73     return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
  74   }
  75
  76   /**
  77    * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
  78    * leading and trailing whitespace). Every line of the Reader should contain only
  79    * one word. The words need to be in lowercase if you make use of an
  80    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
  81    *
  82    * @param reader Reader containing the wordlist
  83    * @param comment The string representing a comment.
  84    * @param matchVersion the Lucene {@link Version}
  85    * @return A CharArraySet with the reader's words
  86    */
  87   public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
  88     return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
  89   }
  90
  91   /**
  92    * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
  93    * leading and trailing whitespace). Every line of the Reader should contain only
  94    * one word. The words need to be in lowercase if you make use of an
  95    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
  96    *
  97    * @param reader Reader containing the wordlist
  98    * @param comment The string representing a comment.
  99    * @param result the {@link CharArraySet} to fill with the readers words
 100    * @return the given {@link CharArraySet} with the reader's words
 101    */
 102   public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
 103     BufferedReader br = null;
 104     try {
 105       br = getBufferedReader(reader);
 106       String word = null;
 107       while ((word = br.readLine()) != null) {
 108         if (word.startsWith(comment) == false){
 109           result.add(word.trim());
 110         }
 111       }
 112     }
 113     finally {
 114       IOUtils.close(br);
 115     }
 116     return result;
 117   }
 118
 119
 120   /**
 121    * Reads stopwords from a stopword list in Snowball format.
 122    * <p>
 123    * The snowball format is the following:
 124    * <ul>
 125    * <li>Lines may contain multiple words separated by whitespace.
 126    * <li>The comment character is the vertical line (&#124;).
 127    * <li>Lines may contain trailing comments.
 128    * </ul>
 129    * </p>
 130    *
 131    * @param reader Reader containing a Snowball stopword list
 132    * @param result the {@link CharArraySet} to fill with the readers words
 133    * @return the given {@link CharArraySet} with the reader's words
 134    */
 135   public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
 136       throws IOException {
 137     BufferedReader br = null;
 138     try {
 139       br = getBufferedReader(reader);
 140       String line = null;
 141       while ((line = br.readLine()) != null) {
 142         int comment = line.indexOf('|');
 143         if (comment >= 0) line = line.substring(0, comment);
 144         String words[] = line.split("\\s+");
 145         for (int i = 0; i < words.length; i++)
 146           if (words[i].length() > 0) result.add(words[i]);
 147       }
 148     } finally {
 149       IOUtils.close(br);
 150     }
 151     return result;
 152   }
 153
 154   /**
 155    * Reads stopwords from a stopword list in Snowball format.
 156    * <p>
 157    * The snowball format is the following:
 158    * <ul>
 159    * <li>Lines may contain multiple words separated by whitespace.
 160    * <li>The comment character is the vertical line (&#124;).
 161    * <li>Lines may contain trailing comments.
 162    * </ul>
 163    * </p>
 164    *
 165    * @param reader Reader containing a Snowball stopword list
 166    * @param matchVersion the Lucene {@link Version}
 167    * @return A {@link CharArraySet} with the reader's words
 168    */
 169   public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
 170     return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
 171   }
 172
 173
 174   /**
 175    * Reads a stem dictionary. Each line contains:
 176    * <pre>word<b>\t</b>stem</pre>
 177    * (i.e. two tab separated words)
 178    *
 179    * @return stem dictionary that overrules the stemming algorithm
 180    * @throws IOException
 181    */
 182   public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
 183     BufferedReader br = null;
 184     try {
 185       br = getBufferedReader(reader);
 186       String line;
 187       while ((line = br.readLine()) != null) {
 188         String[] wordstem = line.split("\t", 2);
 189         result.put(wordstem[0], wordstem[1]);
 190       }
 191     } finally {
 192       IOUtils.close(br);
 193     }
 194     return result;
 195   }
 196
 197   private static BufferedReader getBufferedReader(Reader reader) {
 198     return (reader instanceof BufferedReader) ? (BufferedReader) reader
 199         : new BufferedReader(reader);
 200   }
 201
 202 }