lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/WordlistLoader.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.BufferedReader;
  21 import java.io.File;
  22 import java.io.FileReader;
  23 import java.io.IOException;
  24 import java.io.InputStreamReader;
  25 import java.io.Reader;
  26 import java.util.HashMap;
  27 import java.util.HashSet;
  28 import java.util.Set;
  29
  30 /**
  31  * Loader for text files that represent a list of stopwords.
  32  */
  33 public class WordlistLoader {
  34
  35   /**
  36    * Loads a text file associated with a given class (See
  37    * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
  38    * to a {@link Set} (omitting leading and trailing whitespace). Every line of
  39    * the file should contain only one word. The words need to be in lower-case if
  40    * you make use of an Analyzer which uses LowerCaseFilter (like
  41    * StandardAnalyzer).
  42    *
  43    * @param aClass
  44    *          a class that is associated with the given stopwordResource
  45    * @param stopwordResource
  46    *          name of the resource file associated with the given class
  47    * @return a {@link Set} with the file's words
  48    */
  49   public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
  50       throws IOException {
  51     final Reader reader = new BufferedReader(new InputStreamReader(aClass
  52         .getResourceAsStream(stopwordResource), "UTF-8"));
  53     try {
  54       return getWordSet(reader);
  55     } finally {
  56       reader.close();
  57     }
  58   }
  59
  60   /**
  61    * Loads a text file associated with a given class (See
  62    * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
  63    * to a {@link Set} (omitting leading and trailing whitespace). Every line of
  64    * the file should contain only one word. The words need to be in lower-case if
  65    * you make use of an Analyzer which uses LowerCaseFilter (like
  66    * StandardAnalyzer).
  67    *
  68    * @param aClass
  69    *          a class that is associated with the given stopwordResource
  70    * @param stopwordResource
  71    *          name of the resource file associated with the given class
  72    * @param comment
  73    *          the comment string to ignore
  74    * @return a {@link Set} with the file's words
  75    */
  76   public static Set<String> getWordSet(Class<?> aClass,
  77       String stopwordResource, String comment) throws IOException {
  78     final Reader reader = new BufferedReader(new InputStreamReader(aClass
  79         .getResourceAsStream(stopwordResource), "UTF-8"));
  80     try {
  81       return getWordSet(reader, comment);
  82     } finally {
  83       reader.close();
  84     }
  85   }
  86
  87   /**
  88    * Loads a text file and adds every line as an entry to a HashSet (omitting
  89    * leading and trailing whitespace). Every line of the file should contain only
  90    * one word. The words need to be in lowercase if you make use of an
  91    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
  92    *
  93    * @param wordfile File containing the wordlist
  94    * @return A HashSet with the file's words
  95    */
  96   public static HashSet<String> getWordSet(File wordfile) throws IOException {
  97     FileReader reader = null;
  98     try {
  99       reader = new FileReader(wordfile);
 100       return getWordSet(reader);
 101     }
 102     finally {
 103       if (reader != null)
 104         reader.close();
 105     }
 106   }
 107
 108   /**
 109    * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
 110    * leading and trailing whitespace). Every line of the file should contain only
 111    * one word. The words need to be in lowercase if you make use of an
 112    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 113    *
 114    * @param wordfile File containing the wordlist
 115    * @param comment The comment string to ignore
 116    * @return A HashSet with the file's words
 117    */
 118   public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
 119     FileReader reader = null;
 120     try {
 121       reader = new FileReader(wordfile);
 122       return getWordSet(reader, comment);
 123     }
 124     finally {
 125       if (reader != null)
 126         reader.close();
 127     }
 128   }
 129
 130
 131   /**
 132    * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
 133    * leading and trailing whitespace). Every line of the Reader should contain only
 134    * one word. The words need to be in lowercase if you make use of an
 135    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 136    *
 137    * @param reader Reader containing the wordlist
 138    * @return A HashSet with the reader's words
 139    */
 140   public static HashSet<String> getWordSet(Reader reader) throws IOException {
 141     final HashSet<String> result = new HashSet<String>();
 142     BufferedReader br = null;
 143     try {
 144       if (reader instanceof BufferedReader) {
 145         br = (BufferedReader) reader;
 146       } else {
 147         br = new BufferedReader(reader);
 148       }
 149       String word = null;
 150       while ((word = br.readLine()) != null) {
 151         result.add(word.trim());
 152       }
 153     }
 154     finally {
 155       if (br != null)
 156         br.close();
 157     }
 158     return result;
 159   }
 160
 161   /**
 162    * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
 163    * leading and trailing whitespace). Every line of the Reader should contain only
 164    * one word. The words need to be in lowercase if you make use of an
 165    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 166    *
 167    * @param reader Reader containing the wordlist
 168    * @param comment The string representing a comment.
 169    * @return A HashSet with the reader's words
 170    */
 171   public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
 172     final HashSet<String> result = new HashSet<String>();
 173     BufferedReader br = null;
 174     try {
 175       if (reader instanceof BufferedReader) {
 176         br = (BufferedReader) reader;
 177       } else {
 178         br = new BufferedReader(reader);
 179       }
 180       String word = null;
 181       while ((word = br.readLine()) != null) {
 182         if (word.startsWith(comment) == false){
 183           result.add(word.trim());
 184         }
 185       }
 186     }
 187     finally {
 188       if (br != null)
 189         br.close();
 190     }
 191     return result;
 192   }
 193
 194   /**
 195    * Loads a text file in Snowball format associated with a given class (See
 196    * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
 197    * a {@link Set}. The words need to be in lower-case if you make use of an
 198    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 199    *
 200    * @param aClass a class that is associated with the given stopwordResource
 201    * @param stopwordResource name of the resource file associated with the given
 202    *          class
 203    * @return a {@link Set} with the file's words
 204    * @see #getSnowballWordSet(Reader)
 205    */
 206   public static Set<String> getSnowballWordSet(Class<?> aClass,
 207       String stopwordResource) throws IOException {
 208     final Reader reader = new BufferedReader(new InputStreamReader(aClass
 209         .getResourceAsStream(stopwordResource), "UTF-8"));
 210     try {
 211       return getSnowballWordSet(reader);
 212     } finally {
 213       reader.close();
 214     }
 215   }
 216
 217   /**
 218    * Reads stopwords from a stopword list in Snowball format.
 219    * <p>
 220    * The snowball format is the following:
 221    * <ul>
 222    * <li>Lines may contain multiple words separated by whitespace.
 223    * <li>The comment character is the vertical line (&#124;).
 224    * <li>Lines may contain trailing comments.
 225    * </ul>
 226    * </p>
 227    *
 228    * @param reader Reader containing a Snowball stopword list
 229    * @return A Set with the reader's words
 230    */
 231   public static Set<String> getSnowballWordSet(Reader reader)
 232       throws IOException {
 233     final Set<String> result = new HashSet<String>();
 234     BufferedReader br = null;
 235     try {
 236       if (reader instanceof BufferedReader) {
 237         br = (BufferedReader) reader;
 238       } else {
 239         br = new BufferedReader(reader);
 240       }
 241       String line = null;
 242       while ((line = br.readLine()) != null) {
 243         int comment = line.indexOf('|');
 244         if (comment >= 0) line = line.substring(0, comment);
 245         String words[] = line.split("\\s+");
 246         for (int i = 0; i < words.length; i++)
 247           if (words[i].length() > 0) result.add(words[i]);
 248       }
 249     } finally {
 250       if (br != null) br.close();
 251     }
 252     return result;
 253   }
 254
 255
 256   /**
 257    * Reads a stem dictionary. Each line contains:
 258    * <pre>word<b>\t</b>stem</pre>
 259    * (i.e. two tab separated words)
 260    *
 261    * @return stem dictionary that overrules the stemming algorithm
 262    * @throws IOException
 263    */
 264   public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
 265     if (wordstemfile == null)
 266       throw new NullPointerException("wordstemfile may not be null");
 267     final HashMap<String, String> result = new HashMap<String,String>();
 268     BufferedReader br = null;
 269
 270     try {
 271       br = new BufferedReader(new FileReader(wordstemfile));
 272       String line;
 273       while ((line = br.readLine()) != null) {
 274         String[] wordstem = line.split("\t", 2);
 275         result.put(wordstem[0], wordstem[1]);
 276       }
 277     } finally {
 278       if(br != null)
 279         br.close();
 280     }
 281     return result;
 282   }
 283
 284 }