1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.BufferedReader;
21 import java.io.IOException;
22 import java.io.Reader;
24 import org.apache.lucene.util.IOUtils;
25 import org.apache.lucene.util.Version;
28 * Loader for text files that represent a list of stopwords.
30 * @see IOUtils to obtain {@link Reader} instances
33 public class WordlistLoader {
35 private static final int INITITAL_CAPACITY = 16;
38 * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
39 * leading and trailing whitespace). Every line of the Reader should contain only
40 * one word. The words need to be in lowercase if you make use of an
41 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
43 * @param reader Reader containing the wordlist
44 * @param result the {@link CharArraySet} to fill with the readers words
45 * @return the given {@link CharArraySet} with the reader's words
47 public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
48 BufferedReader br = null;
50 br = getBufferedReader(reader);
52 while ((word = br.readLine()) != null) {
53 result.add(word.trim());
63 * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
64 * leading and trailing whitespace). Every line of the Reader should contain only
65 * one word. The words need to be in lowercase if you make use of an
66 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
68 * @param reader Reader containing the wordlist
69 * @param matchVersion the Lucene {@link Version}
70 * @return A {@link CharArraySet} with the reader's words
72 public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
73 return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
77 * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
78 * leading and trailing whitespace). Every line of the Reader should contain only
79 * one word. The words need to be in lowercase if you make use of an
80 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
82 * @param reader Reader containing the wordlist
83 * @param comment The string representing a comment.
84 * @param matchVersion the Lucene {@link Version}
85 * @return A CharArraySet with the reader's words
87 public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
88 return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
92 * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
93 * leading and trailing whitespace). Every line of the Reader should contain only
94 * one word. The words need to be in lowercase if you make use of an
95 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
97 * @param reader Reader containing the wordlist
98 * @param comment The string representing a comment.
99 * @param result the {@link CharArraySet} to fill with the readers words
100 * @return the given {@link CharArraySet} with the reader's words
102 public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
103 BufferedReader br = null;
105 br = getBufferedReader(reader);
107 while ((word = br.readLine()) != null) {
108 if (word.startsWith(comment) == false){
109 result.add(word.trim());
121 * Reads stopwords from a stopword list in Snowball format.
123 * The snowball format is the following:
125 * <li>Lines may contain multiple words separated by whitespace.
126 * <li>The comment character is the vertical line (|).
127 * <li>Lines may contain trailing comments.
131 * @param reader Reader containing a Snowball stopword list
132 * @param result the {@link CharArraySet} to fill with the readers words
133 * @return the given {@link CharArraySet} with the reader's words
135 public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
137 BufferedReader br = null;
139 br = getBufferedReader(reader);
141 while ((line = br.readLine()) != null) {
142 int comment = line.indexOf('|');
143 if (comment >= 0) line = line.substring(0, comment);
144 String words[] = line.split("\\s+");
145 for (int i = 0; i < words.length; i++)
146 if (words[i].length() > 0) result.add(words[i]);
155 * Reads stopwords from a stopword list in Snowball format.
157 * The snowball format is the following:
159 * <li>Lines may contain multiple words separated by whitespace.
160 * <li>The comment character is the vertical line (|).
161 * <li>Lines may contain trailing comments.
165 * @param reader Reader containing a Snowball stopword list
166 * @param matchVersion the Lucene {@link Version}
167 * @return A {@link CharArraySet} with the reader's words
169 public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
170 return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
175 * Reads a stem dictionary. Each line contains:
176 * <pre>word<b>\t</b>stem</pre>
177 * (i.e. two tab separated words)
179 * @return stem dictionary that overrules the stemming algorithm
180 * @throws IOException
182 public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
183 BufferedReader br = null;
185 br = getBufferedReader(reader);
187 while ((line = br.readLine()) != null) {
188 String[] wordstem = line.split("\t", 2);
189 result.put(wordstem[0], wordstem[1]);
197 private static BufferedReader getBufferedReader(Reader reader) {
198 return (reader instanceof BufferedReader) ? (BufferedReader) reader
199 : new BufferedReader(reader);