1 package org.apache.lucene.analysis.compound;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.Reader;
24 import org.apache.lucene.analysis.Token;
25 import org.apache.lucene.analysis.TokenFilter; // for javadocs
26 import org.apache.lucene.analysis.TokenStream;
27 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
28 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
29 import org.apache.lucene.util.Version;
30 import org.xml.sax.InputSource;
33 * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
35 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
36 * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
37 * grammar and a word dictionary to achieve this.
40 public class HyphenationCompoundWordTokenFilter extends
41 CompoundWordTokenFilterBase {
42 private HyphenationTree hyphenator;
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * string-array dictionary.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, String[] dictionary, int minWordSize,
    int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  // Forward matchVersion to the Version-aware Set constructor. Delegating to
  // the version-less overload would pin the filter to Version.LUCENE_30 and
  // silently discard the version requested by the caller.
  this(matchVersion, input, hyphenator, makeDictionary(dictionary), minWordSize,
      minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * string-array dictionary, using the default word and subword size limits
 * and emitting every matching subword.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against
 */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, String[] dictionary) {
  // matchVersion must be passed along; the version-less overload always
  // behaves as Version.LUCENE_30 regardless of what the caller asked for.
  this(matchVersion, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * dictionary set, using the default word and subword size limits and
 * emitting every matching subword.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against. If this is a
 *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
 *          must have set ignoreCase=false and only contain lower case
 *          strings.
 */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, Set<?> dictionary) {
  // matchVersion must be passed along; the version-less overload always
  // behaves as Version.LUCENE_30 regardless of what the caller asked for.
  this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance with
 * every setting given explicitly.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if Version &gt; 3.0. See
 *          {@link CompoundWordTokenFilterBase} for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against. If this is a
 *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
 *          must have set ignoreCase=false and only contain lower case
 *          strings.
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          only subwords longer than this get to the output stream
 * @param maxSubwordSize
 *          only subwords shorter than this get to the output stream
 * @param onlyLongestMatch
 *          Add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  // The base class receives everything except the hyphenation tree.
  super(matchVersion, input, dictionary,
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);

  this.hyphenator = hyphenator;
}
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
 * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 * null, minWordSize, minSubwordSize, maxSubwordSize, false)}, i.e. the
 * dictionary is {@code null} and {@code onlyLongestMatch} is {@code false}.
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // The cast selects the Set overload; a bare null would also match String[].
  this(matchVersion, input, hyphenator, (Set<?>) null,
      minWordSize, minSubwordSize, maxSubwordSize, false);
}
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary and the
 * default size limits.
 * <p>
 * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
 * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}.
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator) {
  this(matchVersion, input, hyphenator,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance that
 * behaves as {@link Version#LUCENE_30}.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize only subwords longer than this get to the output
 *        stream
 * @param maxSubwordSize only subwords shorter than this get to the output
 *        stream
 * @param onlyLongestMatch Add only the longest matching subword to the stream
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    String[] dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary),
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance that
 * behaves as {@link Version#LUCENE_30}, with the default size limits and
 * every matching subword emitted.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    String[] dictionary) {
  this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance that
 * behaves as {@link Version#LUCENE_30}, with the default size limits and
 * every matching subword emitted.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a
 *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
 *        have set ignoreCase=false and only contain lower case strings.
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary) {
  this(Version.LUCENE_30, input, hyphenator, dictionary,
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance that
 * behaves as {@link Version#LUCENE_30}.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a
 *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
 *        have set ignoreCase=false and only contain lower case strings.
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize only subwords longer than this get to the output
 *        stream
 * @param maxSubwordSize only subwords shorter than this get to the output
 *        stream
 * @param onlyLongestMatch Add only the longest matching subword to the stream
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  // The Version-aware overload performs the same super(...) call and stores
  // the hyphenator, so delegating with LUCENE_30 is equivalent.
  this(Version.LUCENE_30, input, hyphenator, dictionary,
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
260 * Create a hyphenator tree
262 * @param hyphenationFilename the filename of the XML grammar to load
263 * @return An object representing the hyphenation patterns
266 public static HyphenationTree getHyphenationTree(String hyphenationFilename)
268 return getHyphenationTree(new InputSource(hyphenationFilename));
272 * Create a hyphenator tree
274 * @param hyphenationFile the file of the XML grammar to load
275 * @return An object representing the hyphenation patterns
278 public static HyphenationTree getHyphenationTree(File hyphenationFile)
280 return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
284 * Create a hyphenator tree
286 * @param hyphenationReader the reader of the XML grammar to load from
287 * @return An object representing the hyphenation patterns
289 * @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
290 * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
291 * stream, if you like.
294 public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
296 final InputSource is = new InputSource(hyphenationReader);
297 // we need this to load the DTD in very old parsers (like the one in JDK 1.4).
298 // The DTD itsself is provided via EntityResolver, so it should always load, but
299 // some parsers still want to have a base URL (Crimson).
300 is.setSystemId("urn:java:" + HyphenationTree.class.getName());
301 return getHyphenationTree(is);
305 * Create a hyphenator tree
307 * @param hyphenationSource the InputSource pointing to the XML grammar
308 * @return An object representing the hyphenation patterns
311 public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
313 HyphenationTree tree = new HyphenationTree();
314 tree.loadPatterns(hyphenationSource);
319 protected void decomposeInternal(final Token token) {
320 // get the hyphenation points
321 Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
323 // No hyphen points found -> exit
324 if (hyphens == null) {
328 final int[] hyp = hyphens.getHyphenationPoints();
329 char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
331 for (int i = 0; i < hyp.length; ++i) {
332 int remaining = hyp.length - i;
334 Token longestMatchToken = null;
335 for (int j = 1; j < remaining; j++) {
336 int partLength = hyp[i + j] - start;
338 // if the part is longer than maxSubwordSize we
339 // are done with this round
340 if (partLength > this.maxSubwordSize) {
344 // we only put subwords to the token stream
345 // that are longer than minPartSize
346 if (partLength < this.minSubwordSize) {
350 // check the dictionary
351 if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
352 if (this.onlyLongestMatch) {
353 if (longestMatchToken != null) {
354 if (longestMatchToken.length() < partLength) {
355 longestMatchToken = createToken(start, partLength, token);
358 longestMatchToken = createToken(start, partLength, token);
361 tokens.add(createToken(start, partLength, token));
363 } else if (dictionary.contains(lowerCaseTermBuffer, start,
365 // check the dictionary again with a word that is one character
367 // to avoid problems with genitive 's characters and other binding
369 if (this.onlyLongestMatch) {
370 if (longestMatchToken != null) {
371 if (longestMatchToken.length() < partLength - 1) {
372 longestMatchToken = createToken(start, partLength - 1, token);
375 longestMatchToken = createToken(start, partLength - 1, token);
378 tokens.add(createToken(start, partLength - 1, token));
382 if (this.onlyLongestMatch && longestMatchToken!=null) {
383 tokens.add(longestMatchToken);