1 package org.apache.lucene.analysis.compound;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.File;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter; // for javadocs
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.util.Version;
import org.xml.sax.InputSource;
32 * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
34 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
35 * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
36 * grammar and a word dictionary to achieve this.
38 * You must specify the required {@link Version} compatibility when creating
39 * CompoundWordTokenFilterBase:
41 * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
42 * supplementary characters in strings and char arrays provided as compound word dictionaries.</li>
45 * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
46 * it should be case-insensitive unless it contains only lowercased entries and you
47 * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
48 * For optimal performance (as this filter does lots of lookups to the dictionary),
49 * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
50 * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
51 * transformed to case-insensitive!
53 public class HyphenationCompoundWordTokenFilter extends
54 CompoundWordTokenFilterBase {
55 private HyphenationTree hyphenator;
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} from a string-array
 * dictionary and explicit size limits.
 * <p>
 * The array is converted with {@code makeDictionary(matchVersion, dictionary)}
 * and the result is passed, together with all other arguments, to the
 * eight-argument overload that accepts the converted dictionary.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if the version is later than 3.0. See <a
 *          href="CompoundWordTokenFilterBase#version"
 *          >CompoundWordTokenFilterBase</a> for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          minimum length of a subword that is added to the output stream
 * @param maxSubwordSize
 *          maximum length of a subword that is added to the output stream
 * @param onlyLongestMatch
 *          add only the longest matching subword to the stream
 * @deprecated Use the constructors taking {@link Set}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    String[] dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  this(
      matchVersion,
      input,
      hyphenator,
      makeDictionary(matchVersion, dictionary),
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} from a string-array
 * dictionary, using {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE} and {@code DEFAULT_MAX_SUBWORD_SIZE} and
 * with {@code onlyLongestMatch} disabled.
 * <p>
 * The supplied {@code matchVersion} is honored both when the array is
 * converted to a dictionary and when the filter itself is configured.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if the version is later than 3.0. See <a
 *          href="CompoundWordTokenFilterBase#version"
 *          >CompoundWordTokenFilterBase</a> for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against
 * @deprecated Use the constructors taking {@link Set}
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, String[] dictionary) {
  // Forward the caller's version; a hard-coded Version.LUCENE_30 here would
  // silently disable the version-dependent behavior this parameter selects.
  this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary),
      DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} from a {@link Set}
 * dictionary, using {@code DEFAULT_MIN_WORD_SIZE},
 * {@code DEFAULT_MIN_SUBWORD_SIZE} and {@code DEFAULT_MAX_SUBWORD_SIZE} and
 * with {@code onlyLongestMatch} disabled.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if the version is later than 3.0. See <a
 *          href="CompoundWordTokenFilterBase#version"
 *          >CompoundWordTokenFilterBase</a> for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 */
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
    HyphenationTree hyphenator, Set<?> dictionary) {
  // Delegate to the version-aware overload. The version-less overload pins
  // Version.LUCENE_30 and would discard the caller's matchVersion.
  this(matchVersion, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance with every
 * setting given explicitly.
 * <p>
 * All arguments except {@code hyphenator} are forwarded to the superclass
 * constructor; the hyphenator is stored on this instance.
 *
 * @param matchVersion
 *          Lucene version to enable correct Unicode 4.0 behavior in the
 *          dictionaries if the version is later than 3.0. See <a
 *          href="CompoundWordTokenFilterBase#version"
 *          >CompoundWordTokenFilterBase</a> for details.
 * @param input
 *          the {@link TokenStream} to process
 * @param hyphenator
 *          the hyphenation pattern tree to use for hyphenation
 * @param dictionary
 *          the word dictionary to match against.
 * @param minWordSize
 *          only words longer than this get processed
 * @param minSubwordSize
 *          minimum length of a subword that is added to the output stream
 * @param maxSubwordSize
 *          maximum length of a subword that is added to the output stream
 * @param onlyLongestMatch
 *          add only the longest matching subword to the stream
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  super(
      matchVersion,
      input,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);
  this.hyphenator = hyphenator;
}
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 * <p>
 * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
 * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 * null, minWordSize, minSubwordSize, maxSubwordSize, false)}, i.e. with a
 * {@code null} dictionary and {@code onlyLongestMatch} disabled.
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize) {
  // The cast selects the Set overload; a bare null would be ambiguous with String[].
  this(
      matchVersion,
      input,
      hyphenator,
      (Set<?>) null,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      false);
}
/**
 * Create a HyphenationCompoundWordTokenFilter with no dictionary and the
 * default size limits.
 * <p>
 * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
 * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}.
 */
public HyphenationCompoundWordTokenFilter(
    Version matchVersion,
    TokenStream input,
    HyphenationTree hyphenator) {
  this(
      matchVersion,
      input,
      hyphenator,
      DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE,
      DEFAULT_MAX_SUBWORD_SIZE);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * string-array dictionary, fixed to {@code Version.LUCENE_30} behavior.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize minimum length of a subword that is added to the
 *        output stream
 * @param maxSubwordSize maximum length of a subword that is added to the
 *        output stream
 * @param onlyLongestMatch add only the longest matching subword to the stream
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    String[] dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  // The version-aware String[] overload converts the array with the version
  // it is given, so passing LUCENE_30 yields the same dictionary and filter.
  this(
      Version.LUCENE_30,
      input,
      hyphenator,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * string-array dictionary, fixed to {@code Version.LUCENE_30} behavior and
 * using the default size limits with {@code onlyLongestMatch} disabled.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    String[] dictionary) {
  // The version-aware String[] overload applies the same defaults
  // (DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false).
  this(Version.LUCENE_30, input, hyphenator, dictionary);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * {@link Set} dictionary, fixed to {@code Version.LUCENE_30} behavior and
 * using the default size limits with {@code onlyLongestMatch} disabled.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a
 *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
 *        have set ignoreCase=false and only contain lower case strings.
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary) {
  this(
      Version.LUCENE_30,
      input,
      hyphenator,
      dictionary,
      DEFAULT_MIN_WORD_SIZE,
      DEFAULT_MIN_SUBWORD_SIZE,
      DEFAULT_MAX_SUBWORD_SIZE,
      false);
}
/**
 * Creates a new {@link HyphenationCompoundWordTokenFilter} instance from a
 * {@link Set} dictionary with explicit size limits, fixed to
 * {@code Version.LUCENE_30} behavior.
 *
 * @param input the {@link TokenStream} to process
 * @param hyphenator the hyphenation pattern tree to use for hyphenation
 * @param dictionary the word dictionary to match against. If this is a
 *        {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must
 *        have set ignoreCase=false and only contain lower case strings.
 * @param minWordSize only words longer than this get processed
 * @param minSubwordSize minimum length of a subword that is added to the
 *        output stream
 * @param maxSubwordSize maximum length of a subword that is added to the
 *        output stream
 * @param onlyLongestMatch add only the longest matching subword to the stream
 * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
 */
@Deprecated
public HyphenationCompoundWordTokenFilter(
    TokenStream input,
    HyphenationTree hyphenator,
    Set<?> dictionary,
    int minWordSize,
    int minSubwordSize,
    int maxSubwordSize,
    boolean onlyLongestMatch) {
  // The version-aware overload performs the same superclass call and stores
  // the hyphenator, so delegating to it keeps the initialization in one place.
  this(
      Version.LUCENE_30,
      input,
      hyphenator,
      dictionary,
      minWordSize,
      minSubwordSize,
      maxSubwordSize,
      onlyLongestMatch);
}
271 * Create a hyphenator tree
273 * @param hyphenationFilename the filename of the XML grammar to load
274 * @return An object representing the hyphenation patterns
277 public static HyphenationTree getHyphenationTree(String hyphenationFilename)
279 return getHyphenationTree(new InputSource(hyphenationFilename));
283 * Create a hyphenator tree
285 * @param hyphenationFile the file of the XML grammar to load
286 * @return An object representing the hyphenation patterns
289 public static HyphenationTree getHyphenationTree(File hyphenationFile)
291 return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
295 * Create a hyphenator tree
297 * @param hyphenationReader the reader of the XML grammar to load from
298 * @return An object representing the hyphenation patterns
300 * @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
301 * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
302 * stream, if you like.
305 public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
307 final InputSource is = new InputSource(hyphenationReader);
308 // we need this to load the DTD in very old parsers (like the one in JDK 1.4).
309 // The DTD itsself is provided via EntityResolver, so it should always load, but
310 // some parsers still want to have a base URL (Crimson).
311 is.setSystemId("urn:java:" + HyphenationTree.class.getName());
312 return getHyphenationTree(is);
316 * Create a hyphenator tree
318 * @param hyphenationSource the InputSource pointing to the XML grammar
319 * @return An object representing the hyphenation patterns
322 public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
324 HyphenationTree tree = new HyphenationTree();
325 tree.loadPatterns(hyphenationSource);
330 protected void decompose() {
331 // get the hyphenation points
332 Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
333 // No hyphen points found -> exit
334 if (hyphens == null) {
338 final int[] hyp = hyphens.getHyphenationPoints();
340 for (int i = 0; i < hyp.length; ++i) {
341 int remaining = hyp.length - i;
343 CompoundToken longestMatchToken = null;
344 for (int j = 1; j < remaining; j++) {
345 int partLength = hyp[i + j] - start;
347 // if the part is longer than maxSubwordSize we
348 // are done with this round
349 if (partLength > this.maxSubwordSize) {
353 // we only put subwords to the token stream
354 // that are longer than minPartSize
355 if (partLength < this.minSubwordSize) {
359 // check the dictionary
360 if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
361 if (this.onlyLongestMatch) {
362 if (longestMatchToken != null) {
363 if (longestMatchToken.txt.length() < partLength) {
364 longestMatchToken = new CompoundToken(start, partLength);
367 longestMatchToken = new CompoundToken(start, partLength);
370 tokens.add(new CompoundToken(start, partLength));
372 } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
373 // check the dictionary again with a word that is one character
375 // to avoid problems with genitive 's characters and other binding
377 if (this.onlyLongestMatch) {
378 if (longestMatchToken != null) {
379 if (longestMatchToken.txt.length() < partLength - 1) {
380 longestMatchToken = new CompoundToken(start, partLength - 1);
383 longestMatchToken = new CompoundToken(start, partLength - 1);
386 tokens.add(new CompoundToken(start, partLength - 1));
390 if (this.onlyLongestMatch && longestMatchToken!=null) {
391 tokens.add(longestMatchToken);