1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
31 * Loader for text files that represent a list of stopwords.
33 public class WordlistLoader {
36 * Loads a text file associated with a given class (See
37 * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
38 * to a {@link Set} (omitting leading and trailing whitespace). Every line of
39 * the file should contain only one word. The words need to be in lower-case if
40 * you make use of an Analyzer which uses LowerCaseFilter (like
44 * a class that is associated with the given stopwordResource
45 * @param stopwordResource
46 * name of the resource file associated with the given class
47 * @return a {@link Set} with the file's words
49 public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
51 final Reader reader = new BufferedReader(new InputStreamReader(aClass
52 .getResourceAsStream(stopwordResource), "UTF-8"));
54 return getWordSet(reader);
61 * Loads a text file associated with a given class (See
62 * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
63 * to a {@link Set} (omitting leading and trailing whitespace). Every line of
64 * the file should contain only one word. The words need to be in lower-case if
65 * you make use of an Analyzer which uses LowerCaseFilter (like
69 * a class that is associated with the given stopwordResource
70 * @param stopwordResource
71 * name of the resource file associated with the given class
73 * the comment string to ignore
74 * @return a {@link Set} with the file's words
76 public static Set<String> getWordSet(Class<?> aClass,
77 String stopwordResource, String comment) throws IOException {
78 final Reader reader = new BufferedReader(new InputStreamReader(aClass
79 .getResourceAsStream(stopwordResource), "UTF-8"));
81 return getWordSet(reader, comment);
88 * Loads a text file and adds every line as an entry to a HashSet (omitting
89 * leading and trailing whitespace). Every line of the file should contain only
90 * one word. The words need to be in lowercase if you make use of an
91 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
93 * @param wordfile File containing the wordlist
94 * @return A HashSet with the file's words
96 public static HashSet<String> getWordSet(File wordfile) throws IOException {
97 FileReader reader = null;
99 reader = new FileReader(wordfile);
100 return getWordSet(reader);
109 * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
110 * leading and trailing whitespace). Every line of the file should contain only
111 * one word. The words need to be in lowercase if you make use of an
112 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
114 * @param wordfile File containing the wordlist
115 * @param comment The comment string to ignore
116 * @return A HashSet with the file's words
118 public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
119 FileReader reader = null;
121 reader = new FileReader(wordfile);
122 return getWordSet(reader, comment);
132 * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
133 * leading and trailing whitespace). Every line of the Reader should contain only
134 * one word. The words need to be in lowercase if you make use of an
135 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
137 * @param reader Reader containing the wordlist
138 * @return A HashSet with the reader's words
140 public static HashSet<String> getWordSet(Reader reader) throws IOException {
141 final HashSet<String> result = new HashSet<String>();
142 BufferedReader br = null;
144 if (reader instanceof BufferedReader) {
145 br = (BufferedReader) reader;
147 br = new BufferedReader(reader);
150 while ((word = br.readLine()) != null) {
151 result.add(word.trim());
162 * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
163 * leading and trailing whitespace). Every line of the Reader should contain only
164 * one word. The words need to be in lowercase if you make use of an
165 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
167 * @param reader Reader containing the wordlist
168 * @param comment The string representing a comment.
169 * @return A HashSet with the reader's words
171 public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
172 final HashSet<String> result = new HashSet<String>();
173 BufferedReader br = null;
175 if (reader instanceof BufferedReader) {
176 br = (BufferedReader) reader;
178 br = new BufferedReader(reader);
181 while ((word = br.readLine()) != null) {
182 if (word.startsWith(comment) == false){
183 result.add(word.trim());
195 * Loads a text file in Snowball format associated with a given class (See
196 * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
197 * a {@link Set}. The words need to be in lower-case if you make use of an
198 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
200 * @param aClass a class that is associated with the given stopwordResource
201 * @param stopwordResource name of the resource file associated with the given
203 * @return a {@link Set} with the file's words
204 * @see #getSnowballWordSet(Reader)
206 public static Set<String> getSnowballWordSet(Class<?> aClass,
207 String stopwordResource) throws IOException {
208 final Reader reader = new BufferedReader(new InputStreamReader(aClass
209 .getResourceAsStream(stopwordResource), "UTF-8"));
211 return getSnowballWordSet(reader);
218 * Reads stopwords from a stopword list in Snowball format.
220 * The snowball format is the following:
222 * <li>Lines may contain multiple words separated by whitespace.
223 * <li>The comment character is the vertical line (|).
224 * <li>Lines may contain trailing comments.
228 * @param reader Reader containing a Snowball stopword list
229 * @return A Set with the reader's words
231 public static Set<String> getSnowballWordSet(Reader reader)
233 final Set<String> result = new HashSet<String>();
234 BufferedReader br = null;
236 if (reader instanceof BufferedReader) {
237 br = (BufferedReader) reader;
239 br = new BufferedReader(reader);
242 while ((line = br.readLine()) != null) {
243 int comment = line.indexOf('|');
244 if (comment >= 0) line = line.substring(0, comment);
245 String words[] = line.split("\\s+");
246 for (int i = 0; i < words.length; i++)
247 if (words[i].length() > 0) result.add(words[i]);
250 if (br != null) br.close();
257 * Reads a stem dictionary. Each line contains:
258 * <pre>word<b>\t</b>stem</pre>
259 * (i.e. two tab separated words)
261 * @return stem dictionary that overrules the stemming algorithm
262 * @throws IOException
264 public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
265 if (wordstemfile == null)
266 throw new NullPointerException("wordstemfile may not be null");
267 final HashMap<String, String> result = new HashMap<String,String>();
268 BufferedReader br = null;
271 br = new BufferedReader(new FileReader(wordstemfile));
273 while ((line = br.readLine()) != null) {
274 String[] wordstem = line.split("\t", 2);
275 result.put(wordstem[0], wordstem[1]);