1 package org.apache.lucene.analysis.compound;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
23 import org.apache.lucene.analysis.Token;
24 import org.apache.lucene.analysis.TokenFilter; // for javadocs
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.util.Version;
/**
 * A {@link TokenFilter} that decomposes compound words found in many Germanic
 * languages.
 * <p>
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
 * "Donaudampfschiff" even when you only enter "schiff". It uses a brute-force
 * algorithm to achieve this.
 */
36 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
39 * Creates a new {@link DictionaryCompoundWordTokenFilter}
41 * @param input the {@link TokenStream} to process
42 * @param dictionary the word dictionary to match against
43 * @param minWordSize only words longer than this get processed
44 * @param minSubwordSize only subwords longer than this get to the output stream
45 * @param maxSubwordSize only subwords shorter than this get to the output stream
46 * @param onlyLongestMatch Add only the longest matching subword to the stream
47 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
50 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
51 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
52 super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
56 * Creates a new {@link DictionaryCompoundWordTokenFilter}
58 * @param input the {@link TokenStream} to process
59 * @param dictionary the word dictionary to match against
60 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead
63 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
64 super(Version.LUCENE_30, input, dictionary);
68 * Creates a new {@link DictionaryCompoundWordTokenFilter}
70 * @param input the {@link TokenStream} to process
71 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
73 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead
76 public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
77 super(Version.LUCENE_30, input, dictionary);
81 * Creates a new {@link DictionaryCompoundWordTokenFilter}
83 * @param input the {@link TokenStream} to process
84 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
86 * @param minWordSize only words longer than this get processed
87 * @param minSubwordSize only subwords longer than this get to the output stream
88 * @param maxSubwordSize only subwords shorter than this get to the output stream
89 * @param onlyLongestMatch Add only the longest matching subword to the stream
90 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
93 public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
94 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
95 super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
99 * Creates a new {@link DictionaryCompoundWordTokenFilter}
101 * @param matchVersion
102 * Lucene version to enable correct Unicode 4.0 behavior in the
103 * dictionaries if Version > 3.0. See <a
104 * href="CompoundWordTokenFilterBase#version"
105 * >CompoundWordTokenFilterBase</a> for details.
107 * the {@link TokenStream} to process
109 * the word dictionary to match against
111 * only words longer than this get processed
112 * @param minSubwordSize
113 * only subwords longer than this get to the output stream
114 * @param maxSubwordSize
115 * only subwords shorter than this get to the output stream
116 * @param onlyLongestMatch
117 * Add only the longest matching subword to the stream
119 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
120 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
121 super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
125 * Creates a new {@link DictionaryCompoundWordTokenFilter}
127 * @param matchVersion
128 * Lucene version to enable correct Unicode 4.0 behavior in the
129 * dictionaries if Version > 3.0. See <a
130 * href="CompoundWordTokenFilterBase#version"
131 * >CompoundWordTokenFilterBase</a> for details.
134 * the {@link TokenStream} to process
136 * the word dictionary to match against
138 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
139 super(matchVersion, input, dictionary);
143 * Creates a new {@link DictionaryCompoundWordTokenFilter}
145 * @param matchVersion
146 * Lucene version to enable correct Unicode 4.0 behavior in the
147 * dictionaries if Version > 3.0. See <a
148 * href="CompoundWordTokenFilterBase#version"
149 * >CompoundWordTokenFilterBase</a> for details.
151 * the {@link TokenStream} to process
153 * the word dictionary to match against. If this is a
154 * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
155 * must have set ignoreCase=false and only contain lower case
158 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
159 super(matchVersion, input, dictionary);
163 * Creates a new {@link DictionaryCompoundWordTokenFilter}
165 * @param matchVersion
166 * Lucene version to enable correct Unicode 4.0 behavior in the
167 * dictionaries if Version > 3.0. See <a
168 * href="CompoundWordTokenFilterBase#version"
169 * >CompoundWordTokenFilterBase</a> for details.
171 * the {@link TokenStream} to process
173 * the word dictionary to match against. If this is a
174 * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
175 * must have set ignoreCase=false and only contain lower case
178 * only words longer than this get processed
179 * @param minSubwordSize
180 * only subwords longer than this get to the output stream
181 * @param maxSubwordSize
182 * only subwords shorter than this get to the output stream
183 * @param onlyLongestMatch
184 * Add only the longest matching subword to the stream
186 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
187 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
188 super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
192 protected void decomposeInternal(final Token token) {
193 // Only words longer than minWordSize get processed
194 if (token.length() < this.minWordSize) {
198 char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
200 for (int i=0;i<token.length()-this.minSubwordSize;++i) {
201 Token longestMatchToken=null;
202 for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
203 if(i+j>token.length()) {
206 if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
207 if (this.onlyLongestMatch) {
208 if (longestMatchToken!=null) {
209 if (longestMatchToken.length()<j) {
210 longestMatchToken=createToken(i,j,token);
213 longestMatchToken=createToken(i,j,token);
216 tokens.add(createToken(i,j,token));
220 if (this.onlyLongestMatch && longestMatchToken!=null) {
221 tokens.add(longestMatchToken);