1 package org.apache.lucene.analysis.compound;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
23 import org.apache.lucene.analysis.TokenFilter; // for javadocs
24 import org.apache.lucene.analysis.TokenStream;
25 import org.apache.lucene.util.Version;
28 * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
30 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
31 * "Donaudampfschiff" even when you only enter "schiff".
32 * It uses a brute-force algorithm to achieve this.
34 * You must specify the required {@link Version} compatibility when creating
35 * CompoundWordTokenFilterBase:
37 * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
38 * supplementary characters in strings and char arrays provided as compound word
41 * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
42 * it should be case-insensitive unless it contains only lowercased entries and you
43 * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
 * For optimal performance (as this filter does lots of lookups in the dictionary),
 * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
46 * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
47 * transformed to case-insensitive!
49 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
52 * Creates a new {@link DictionaryCompoundWordTokenFilter}.
53 * @param input the {@link TokenStream} to process
54 * @param dictionary the word dictionary to match against
55 * @param minWordSize only words longer than this get processed
56 * @param minSubwordSize only subwords longer than this get to the output stream
57 * @param maxSubwordSize only subwords shorter than this get to the output stream
58 * @param onlyLongestMatch Add only the longest matching subword to the stream
59 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
62 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
63 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
64 super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
68 * Creates a new {@link DictionaryCompoundWordTokenFilter}
70 * @param input the {@link TokenStream} to process
71 * @param dictionary the word dictionary to match against
72 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead
75 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
76 super(Version.LUCENE_30, input, dictionary);
80 * Creates a new {@link DictionaryCompoundWordTokenFilter}
82 * @param input the {@link TokenStream} to process
83 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
85 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead
88 public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
89 super(Version.LUCENE_30, input, dictionary);
93 * Creates a new {@link DictionaryCompoundWordTokenFilter}
95 * @param input the {@link TokenStream} to process
96 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
98 * @param minWordSize only words longer than this get processed
99 * @param minSubwordSize only subwords longer than this get to the output stream
100 * @param maxSubwordSize only subwords shorter than this get to the output stream
101 * @param onlyLongestMatch Add only the longest matching subword to the stream
102 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
105 public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
106 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
107 super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
111 * Creates a new {@link DictionaryCompoundWordTokenFilter}
113 * @param matchVersion
114 * Lucene version to enable correct Unicode 4.0 behavior in the
115 * dictionaries if Version > 3.0. See <a
116 * href="CompoundWordTokenFilterBase#version"
117 * >CompoundWordTokenFilterBase</a> for details.
119 * the {@link TokenStream} to process
121 * the word dictionary to match against
123 * only words longer than this get processed
124 * @param minSubwordSize
125 * only subwords longer than this get to the output stream
126 * @param maxSubwordSize
127 * only subwords shorter than this get to the output stream
128 * @param onlyLongestMatch
129 * Add only the longest matching subword to the stream
130 * @deprecated Use the constructors taking {@link Set}
133 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
134 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
135 super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
139 * Creates a new {@link DictionaryCompoundWordTokenFilter}
141 * @param matchVersion
142 * Lucene version to enable correct Unicode 4.0 behavior in the
143 * dictionaries if Version > 3.0. See <a
144 * href="CompoundWordTokenFilterBase#version"
145 * >CompoundWordTokenFilterBase</a> for details.
148 * the {@link TokenStream} to process
150 * the word dictionary to match against
151 * @deprecated Use the constructors taking {@link Set}
154 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
155 super(matchVersion, input, dictionary);
159 * Creates a new {@link DictionaryCompoundWordTokenFilter}
161 * @param matchVersion
162 * Lucene version to enable correct Unicode 4.0 behavior in the
163 * dictionaries if Version > 3.0. See <a
164 * href="CompoundWordTokenFilterBase#version"
165 * >CompoundWordTokenFilterBase</a> for details.
167 * the {@link TokenStream} to process
169 * the word dictionary to match against.
171 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
172 super(matchVersion, input, dictionary);
176 * Creates a new {@link DictionaryCompoundWordTokenFilter}
178 * @param matchVersion
179 * Lucene version to enable correct Unicode 4.0 behavior in the
180 * dictionaries if Version > 3.0. See <a
181 * href="CompoundWordTokenFilterBase#version"
182 * >CompoundWordTokenFilterBase</a> for details.
184 * the {@link TokenStream} to process
186 * the word dictionary to match against.
188 * only words longer than this get processed
189 * @param minSubwordSize
190 * only subwords longer than this get to the output stream
191 * @param maxSubwordSize
192 * only subwords shorter than this get to the output stream
193 * @param onlyLongestMatch
194 * Add only the longest matching subword to the stream
196 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
197 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
198 super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
202 protected void decompose() {
203 final int len = termAtt.length();
204 for (int i=0;i<=len-this.minSubwordSize;++i) {
205 CompoundToken longestMatchToken=null;
206 for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
210 if(dictionary.contains(termAtt.buffer(), i, j)) {
211 if (this.onlyLongestMatch) {
212 if (longestMatchToken!=null) {
213 if (longestMatchToken.txt.length()<j) {
214 longestMatchToken=new CompoundToken(i,j);
217 longestMatchToken=new CompoundToken(i,j);
220 tokens.add(new CompoundToken(i,j));
224 if (this.onlyLongestMatch && longestMatchToken!=null) {
225 tokens.add(longestMatchToken);