package org.apache.lucene.analysis.compound;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.io.StringReader;
23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24 import org.apache.lucene.analysis.MockTokenizer;
25 import org.apache.lucene.analysis.TokenFilter;
26 import org.apache.lucene.analysis.TokenStream;
27 import org.apache.lucene.analysis.Tokenizer;
28 import org.apache.lucene.analysis.WhitespaceTokenizer;
29 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31 import org.apache.lucene.util.Attribute;
32 import org.apache.lucene.util.AttributeImpl;
33 import org.xml.sax.InputSource;
35 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
36 public void testHyphenationCompoundWordsDA() throws Exception {
37 String[] dict = { "læse", "hest" };
39 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
40 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
41 .getHyphenationTree(is);
43 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
44 new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
46 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
47 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
48 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
49 assertTokenStreamContents(tf,
50 new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
51 new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
55 public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
56 String[] dict = { "basketball", "basket", "ball", "kurv" };
58 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
59 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
60 .getHyphenationTree(is);
62 // the word basket will not be added due to the longest match option
63 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
64 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
66 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
67 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
68 assertTokenStreamContents(tf,
69 new String[] { "basketballkurv", "basketball", "ball", "kurv" },
70 new int[] { 1, 0, 0, 0 }
76 * With hyphenation-only, you can get a lot of nonsense tokens.
77 * This can be controlled with the min/max subword size.
79 public void testHyphenationOnly() throws Exception {
80 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
81 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
82 .getHyphenationTree(is);
84 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
86 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
88 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
92 assertTokenStreamContents(tf,
93 new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
96 tf = new HyphenationCompoundWordTokenFilter(
98 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
100 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
104 assertTokenStreamContents(tf,
105 new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
108 tf = new HyphenationCompoundWordTokenFilter(
109 TEST_VERSION_CURRENT,
110 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
112 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
116 assertTokenStreamContents(tf,
117 new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
118 "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
123 public void testDumbCompoundWordsSE() throws Exception {
124 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
125 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
126 "Sko", "Vind", "Rute", "Torkare", "Blad" };
128 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
131 "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
132 MockTokenizer.WHITESPACE, false),
135 assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
136 "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
137 "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
138 "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
139 "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
140 "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
141 "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
142 "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
143 17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
144 77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
145 137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
146 28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
147 87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
148 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
149 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
153 public void testDumbCompoundWordsSELongestMatch() throws Exception {
154 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
155 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
156 "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
158 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
159 new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
160 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
161 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
162 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
164 assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
165 "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
166 14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
170 public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
171 String[] dict = {"ab", "cd", "ef"};
173 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
174 new WhitespaceTokenizer(TEST_VERSION_CURRENT,
179 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
180 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
181 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
183 assertTokenStreamContents(tf,
184 new String[] { "abcdef", "ab", "cd", "ef" },
185 new int[] { 0, 0, 2, 4},
186 new int[] { 6, 2, 4, 6},
187 new int[] { 1, 0, 0, 0}
191 public void testWordComponentWithLessThanMinimumLength() throws Exception {
192 String[] dict = {"abc", "d", "efg"};
194 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
195 new WhitespaceTokenizer(TEST_VERSION_CURRENT,
200 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
201 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
202 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
204 // since "d" is shorter than the minimum subword size, it should not be added to the token stream
205 assertTokenStreamContents(tf,
206 new String[] { "abcdefg", "abc", "efg" },
207 new int[] { 0, 0, 4},
208 new int[] { 7, 3, 7},
213 public void testReset() throws Exception {
214 String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
215 "Aufgabe", "Überwachung" };
217 Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
218 "Rindfleischüberwachungsgesetz"));
219 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
221 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
222 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
223 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
225 CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
226 assertTrue(tf.incrementToken());
227 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
228 assertTrue(tf.incrementToken());
229 assertEquals("Rind", termAtt.toString());
230 wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
232 assertTrue(tf.incrementToken());
233 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
236 public void testRetainMockAttribute() throws Exception {
237 String[] dict = { "abc", "d", "efg" };
238 Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
239 new StringReader("abcdefg"));
240 TokenStream stream = new MockRetainAttributeFilter(tokenizer);
241 stream = new DictionaryCompoundWordTokenFilter(
242 TEST_VERSION_CURRENT, stream, dict,
243 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
244 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
245 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
246 MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
247 while (stream.incrementToken()) {
248 assertTrue("Custom attribute value was lost", retAtt.getRetain());
253 public static interface MockRetainAttribute extends Attribute {
254 void setRetain(boolean attr);
258 public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
259 private boolean retain = false;
261 public void clear() {
264 public boolean getRetain() {
267 public void setRetain(boolean retain) {
268 this.retain = retain;
271 public void copyTo(AttributeImpl target) {
272 MockRetainAttribute t = (MockRetainAttribute) target;
277 private static class MockRetainAttributeFilter extends TokenFilter {
279 MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
281 MockRetainAttributeFilter(TokenStream input) {
286 public boolean incrementToken() throws IOException {
287 if (input.incrementToken()){
288 retainAtt.setRetain(true);