1 package org.apache.lucene.analysis.compound;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.StringReader;
21 import org.xml.sax.InputSource;
23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24 import org.apache.lucene.analysis.MockTokenizer;
25 import org.apache.lucene.analysis.Tokenizer;
26 import org.apache.lucene.analysis.WhitespaceTokenizer;
27 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
30 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
// Hyphenation-grammar decompounding with a dictionary: the Danish compound
// "læsehest" should emit the original token plus the dictionary subwords
// "læse" and "hest" (posIncrement 0, i.e. stacked at the same position),
// while non-compound tokens pass through unchanged.
// NOTE(review): chunk is truncated/garbled — a constructor-argument line
// (presumably passing `hyphenator`) and the closing `);`/`}` of this method
// are missing from this view; do not edit code here without the full file.
31 public void testHyphenationCompoundWordsDA() throws Exception {
32 String[] dict = { "læse", "hest" };
// Hyphenation patterns are loaded from the bundled Danish grammar file.
34 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
35 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
36 .getHyphenationTree(is);
38 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
39 new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
41 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
42 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
// last arg false: onlyLongestMatch disabled — TODO confirm against the filter's signature
43 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
44 assertTokenStreamContents(tf,
45 new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
// posIncrements: 0 marks subwords stacked on the preceding compound token.
46 new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
// Same hyphenation+dictionary setup, but with onlyLongestMatch=true: of the
// overlapping dictionary entries "basketball"/"basket", only the longest
// ("basketball") is emitted for "basketballkurv".
// NOTE(review): chunk is truncated — the line passing `hyphenator`/`dict`
// into the constructor and the method's closing `);`/`}` are not visible here.
50 public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
51 String[] dict = { "basketball", "basket", "ball", "kurv" };
// Danish grammar reused; the dictionary, not the grammar, drives the expectation here.
53 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
54 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
55 .getHyphenationTree(is);
57 // the word basket will not be added due to the longest match option
58 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
59 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
// 40: maxSubwordSize raised so "basketball" (10 chars) is not cut off; true: onlyLongestMatch
61 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
62 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
63 assertTokenStreamContents(tf,
64 new String[] { "basketballkurv", "basketball", "ball", "kurv" },
65 new int[] { 1, 0, 0, 0 }
// NOTE(review): the two lines below are the remains of a javadoc block whose
// /** and */ delimiters were lost in extraction.
71 * With hyphenation-only, you can get a lot of nonsense tokens.
72 * This can be controlled with the min/max subword size.
// Exercises the hyphenation-only mode (no dictionary): the same input
// "basketballkurv" is run three times with different min/max subword sizes,
// showing how those bounds control which hyphenation fragments are emitted.
// NOTE(review): heavily truncated — for each of the three filter constructions
// several argument lines (hyphenator, subword-size values) and the closing
// `);` of each assert are missing from this view.
74 public void testHyphenationOnly() throws Exception {
75 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
76 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
77 .getHyphenationTree(is);
// First run: small max subword size — short fragments like "ba"/"sket" appear.
79 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
81 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
83 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
87 assertTokenStreamContents(tf,
88 new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
// Second run: different subword bounds — longer fragments such as "basket"/"lkurv".
91 tf = new HyphenationCompoundWordTokenFilter(
93 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
95 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
// Third run: widest bounds — every hyphenation-point combination is emitted,
// illustrating the "nonsense tokens" warning in the javadoc above.
99 assertTokenStreamContents(tf,
100 new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
103 tf = new HyphenationCompoundWordTokenFilter(
104 TEST_VERSION_CURRENT,
105 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
107 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
111 assertTokenStreamContents(tf,
112 new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
113 "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
// Pure dictionary-based ("dumb") decompounding on a set of Swedish compounds:
// each whitespace token is emitted as-is, followed by every dictionary subword
// found inside it. The three int[] arrays check start offsets, end offsets and
// position increments respectively (subwords keep the compound's position via
// increment 0 and carry offsets into the original text).
// NOTE(review): truncated — the MockTokenizer/StringReader construction that
// precedes the long input string, the min/max-subword argument lines, and the
// final posIncrement values plus the closing `);`/`}` are missing from this view.
118 public void testDumbCompoundWordsSE() throws Exception {
119 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
120 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
121 "Sko", "Vind", "Rute", "Torkare", "Blad" };
123 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
// "abba" at the end matches no dictionary entry and must pass through alone.
126 "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
127 MockTokenizer.WHITESPACE, false),
130 assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
131 "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
132 "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
133 "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
134 "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
135 "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
136 "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
// start offsets (into the original input string)
137 "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
138 17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
139 77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
// end offsets
140 137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
141 28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
142 87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
// position increments: 1 for each compound token, 0 for its stacked subwords
143 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
144 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
// Dictionary-based decompounding with onlyLongestMatch=true: at a position
// where both "Fiols" and the longer "Fiolsfodral" match, only the longest
// entry is emitted (plus "fodral", matched later inside the compound).
// NOTE(review): truncated — the final posIncrement values and the closing
// `);`/`}` of this method are not visible in this chunk.
148 public void testDumbCompoundWordsSELongestMatch() throws Exception {
149 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
150 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
151 "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
153 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
154 new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
155 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
156 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
// true: onlyLongestMatch — "Fiols" is suppressed in favor of "Fiolsfodral".
157 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
// Arrays: expected terms, start offsets, end offsets, position increments.
159 assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
160 "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
161 14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
// Verifies the filter is reusable after the underlying tokenizer is reset:
// consume the first two tokens, reset the tokenizer with a fresh reader, and
// check the stream starts over from the original compound token.
// NOTE(review): truncated — the line passing `wsTokenizer`/`dict` into the
// filter constructor, a reset call on `tf` itself (if any), and the end of the
// method/class run past this chunk's visible lines.
165 public void testReset() throws Exception {
166 String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
167 "Aufgabe", "Überwachung" };
169 Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
170 "Rindfleischüberwachungsgesetz"));
171 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
173 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
174 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
175 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
// First pass: original compound token, then its first dictionary subword.
177 CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
178 assertTrue(tf.incrementToken());
179 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
180 assertTrue(tf.incrementToken());
181 assertEquals("Rind", termAtt.toString());
// Reset the source tokenizer with a new reader; the filter must replay.
182 wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
184 assertTrue(tf.incrementToken());
185 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());