2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.synonym;
20 import java.io.Reader;
21 import java.io.StringReader;
22 import java.util.ArrayList;
23 import java.util.Arrays;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.List;
30 import org.apache.lucene.analysis.Analyzer;
31 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
32 import org.apache.lucene.analysis.MockAnalyzer;
33 import org.apache.lucene.analysis.MockTokenizer;
34 import org.apache.lucene.analysis.Tokenizer;
35 import org.apache.lucene.analysis.tokenattributes.*;
36 import org.apache.lucene.analysis.ReusableAnalyzerBase;
37 import org.apache.lucene.util.CharsRef;
38 import org.apache.lucene.util._TestUtil;
40 public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
// Shared per-test fixtures: the synonym map being built, the input
// tokenizer, the filter under test, and cached attribute views used
// by verify() to inspect each emitted token.
42 private SynonymMap.Builder b;
43 private Tokenizer tokensIn;
44 private SynonymFilter tokensOut;
45 private CharTermAttribute termAtt;
46 private PositionIncrementAttribute posIncrAtt;
47 private OffsetAttribute offsetAtt;
// Registers one synonym rule in the builder. Multi-word entries use
// '\u0000' as the word separator (the form SynonymMap expects).
// NOTE(review): the trailing builder argument(s) and closing lines of
// this method are not visible in this chunk — presumably keepOrig is
// passed through; confirm against the full file.
49 private void add(String input, String output, boolean keepOrig) {
50 b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
51 new CharsRef(output.replaceAll(" +", "\u0000")),
// Asserts that the term attribute's contents equal the expected string,
// comparing length first and then each character of the backing buffer.
55 private void assertEquals(CharTermAttribute term, String expected) {
56 assertEquals(expected.length(), term.length());
57 final char[] buffer = term.buffer();
58 for(int chIDX=0;chIDX<expected.length();chIDX++) {
59 assertEquals(expected.charAt(chIDX), buffer[chIDX]);
63 // todo: we should probably refactor this guy to use/take analyzer,
64 // the tests are a little messy
// Runs `input` through tokensIn/tokensOut and checks the stream against
// `output`, a space-separated list of positions where each position is
// one or more expected terms joined by '/'. At a given position the
// first token must have posIncr 1, stacked tokens posIncr 0, and all
// tokens at the position must share the same start/end offsets.
// NOTE(review): several interior lines (loop/if closers, expectedUpto
// declaration) are missing from this chunk.
65 private void verify(String input, String output) throws Exception {
67 System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
70 tokensIn.reset(new StringReader(input));
72 final String[] expected = output.split(" ");
74 while(tokensOut.incrementToken()) {
77 System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
80 assertTrue(expectedUpto < expected.length);
81 final int startOffset = offsetAtt.startOffset();
82 final int endOffset = offsetAtt.endOffset();
// Tokens stacked at this position are separated by '/':
84 final String[] expectedAtPos = expected[expectedUpto++].split("/");
85 for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
87 assertTrue(tokensOut.incrementToken());
89 System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
92 assertEquals(termAtt, expectedAtPos[atPos]);
93 assertEquals(atPos == 0 ? 1 : 0,
94 posIncrAtt.getPositionIncrement());
95 // start/end offset of all tokens at same pos should
97 assertEquals(startOffset, offsetAtt.startOffset());
98 assertEquals(endOffset, offsetAtt.endOffset());
104 System.out.println(" incr: END");
// Every expected position must have been consumed:
106 assertEquals(expectedUpto, expected.length);
// Hand-written synonym scenarios: overlapping rules, outputs longer
// than the input, multiple outputs for one input, and mixed keepOrig.
// NOTE(review): the MockTokenizer/SynonymFilter constructor argument
// lines are missing from this chunk — presumably b.build() and an
// ignoreCase flag; confirm against the full file.
109 public void testBasic() throws Exception {
110 b = new SynonymMap.Builder(true);
111 add("a", "foo", true);
112 add("a b", "bar fee", true);
113 add("b c", "dog collar", true);
114 add("c d", "dog harness holder extras", true);
115 add("m c e", "dog barks loudly", false);
117 add("e f", "foo bar", false);
118 add("e f", "baz bee", false);
120 add("z", "boo", false);
121 add("y", "bee", true);
123 tokensIn = new MockTokenizer(new StringReader("a"),
124 MockTokenizer.WHITESPACE,
// Prime the tokenizer once so it is in a reusable state:
127 assertTrue(tokensIn.incrementToken());
128 assertFalse(tokensIn.incrementToken());
132 tokensOut = new SynonymFilter(tokensIn,
135 termAtt = tokensOut.addAttribute(CharTermAttribute.class);
136 posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
137 offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
139 verify("a b c", "a/bar b/fee c");
141 // syn output extends beyond input tokens
142 verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");
144 verify("a b a", "a/bar b/fee a/foo");
146 // outputs that add to one another:
147 verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");
149 // two outputs for same input
150 verify("e f", "foo/baz bar/bee");
152 // mixed keepOrig true/false:
153 verify("a m c e x", "a/foo dog barks loudly x");
154 verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
155 assertTrue(tokensOut.getCaptureCount() > 0);
157 // no captureStates when no syns matched
158 verify("p q r s t", "p q r s t");
159 assertEquals(0, tokensOut.getCaptureCount());
161 // no captureStates when only single-input syns, w/ no
162 // lookahead needed, matched
163 verify("p q z y t", "p q boo y/bee t");
164 assertEquals(0, tokensOut.getCaptureCount());
// Builds a random "document" of `length` single-character tokens drawn
// from [start, start+alphabetSize), each token followed by a space —
// so the result is 2*length chars and ends with a trailing space.
167 private String getRandomString(char start, int alphabetSize, int length) {
168 assert alphabetSize <= 26;
169 char[] s = new char[2*length];
170 for(int charIDX=0;charIDX<length;charIDX++) {
171 s[2*charIDX] = (char) (start + random.nextInt(alphabetSize));
172 s[2*charIDX+1] = ' ';
174 return new String(s);
// Holder for one synonym rule used by slowSynMatcher. Its fields
// (`in`, `out`, `keepOrig`, judging by the usage sites below) are not
// visible in this chunk.
177 private static class OneSyn {
// Brute-force reference implementation: given a doc of single-char
// space-separated tokens and a list of synonym rules, computes the
// expected SynonymFilter output string (positions separated by spaces,
// stacked tokens joined by '/') using greedy, longest-match-wins
// conflict resolution — the same policy the real filter uses.
// NOTE(review): numerous interior lines (idx init/advance, else
// branches, loop closers, synUpto declaration) are missing from this
// chunk; comments below describe only the visible logic.
183 public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) {
184 assertTrue(doc.length() % 2 == 0);
185 final int numInputs = doc.length()/2;
186 boolean[] keepOrigs = new boolean[numInputs];
187 boolean[] hasMatch = new boolean[numInputs];
188 Arrays.fill(keepOrigs, false);
189 String[] outputs = new String[numInputs + maxOutputLength];
190 OneSyn[] matches = new OneSyn[numInputs];
// Pass 1: find every occurrence of every rule; at each start position
// keep only the longest-input rule.
191 for(OneSyn syn : syns) {
194 idx = doc.indexOf(syn.in, 1+idx);
198 assertTrue(idx % 2 == 0);
199 final int matchIDX = idx/2;
200 assertTrue(syn.in.length() % 2 == 1);
201 if (matches[matchIDX] == null) {
202 matches[matchIDX] = syn;
203 } else if (syn.in.length() > matches[matchIDX].in.length()) {
204 // Greedy conflict resolution: longer match wins:
205 matches[matchIDX] = syn;
207 assertTrue(syn.in.length() < matches[matchIDX].in.length());
212 // Greedy conflict resolution: if syn matches a range of inputs,
213 // it prevents other syns from matching that range
214 for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
215 final OneSyn match = matches[inputIDX];
217 final int synInLength = (1+match.in.length())/2;
218 for(int nextInputIDX=inputIDX+1;nextInputIDX<numInputs && nextInputIDX<(inputIDX+synInLength);nextInputIDX++) {
219 matches[nextInputIDX] = null;
224 // Fill overlapping outputs:
225 for(int inputIDX=0;inputIDX<numInputs;inputIDX++) {
226 final OneSyn syn = matches[inputIDX];
// Mark every input position covered by this match and record whether
// the original tokens should be kept:
230 for(int idx=0;idx<(1+syn.in.length())/2;idx++) {
231 hasMatch[inputIDX+idx] = true;
232 keepOrigs[inputIDX+idx] |= syn.keepOrig;
// Spread each output's tokens across consecutive positions, stacking
// with '/' where a position already has output:
234 for(String synOut : syn.out) {
235 final String[] synOutputs = synOut.split(" ");
236 assertEquals(synOutputs.length, (1+synOut.length())/2);
237 final int matchEnd = inputIDX + synOutputs.length;
239 for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) {
240 if (outputs[matchIDX] == null) {
241 outputs[matchIDX] = synOutputs[synUpto++];
243 outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
// Final pass: render kept originals and synonym outputs position by
// position into the expected string:
249 StringBuilder sb = new StringBuilder();
250 String[] inputTokens = doc.split(" ");
251 final int limit = inputTokens.length + maxOutputLength;
252 for(int inputIDX=0;inputIDX<limit;inputIDX++) {
253 boolean posHasOutput = false;
254 if (inputIDX >= numInputs && outputs[inputIDX] == null) {
257 if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX])) {
258 assertTrue(inputTokens[inputIDX].length() != 0);
259 sb.append(inputTokens[inputIDX]);
263 if (outputs[inputIDX] != null) {
267 sb.append(outputs[inputIDX]);
268 } else if (!posHasOutput) {
271 if (inputIDX < limit-1) {
276 return sb.toString();
// Randomized test: builds a random document and random synonym rules,
// computes the expected output with slowSynMatcher, and verifies the
// real SynonymFilter agrees.
// NOTE(review): several lines are missing from this chunk, including
// the OneSyn construction branch, the pruneDups call presumably made
// when dedup is on, and the filter constructor arguments.
279 public void testRandom() throws Exception {
281 final int alphabetSize = _TestUtil.nextInt(random, 2, 7);
283 final int docLen = atLeast(3000);
284 //final int docLen = 50;
286 final String document = getRandomString('a', alphabetSize, docLen);
289 System.out.println("TEST: doc=" + document);
292 final int numSyn = atLeast(5);
293 //final int numSyn = 2;
295 final Map<String,OneSyn> synMap = new HashMap<String,OneSyn>();
296 final List<OneSyn> syns = new ArrayList<OneSyn>();
297 final boolean dedup = random.nextBoolean();
299 System.out.println(" dedup=" + dedup);
301 b = new SynonymMap.Builder(dedup);
302 for(int synIDX=0;synIDX<numSyn;synIDX++) {
303 final String synIn = getRandomString('a', alphabetSize, _TestUtil.nextInt(random, 1, 5)).trim();
304 OneSyn s = synMap.get(synIn);
// New rule for this input: fresh output list, random keepOrig:
309 s.out = new ArrayList<String>();
310 synMap.put(synIn, s);
311 s.keepOrig = random.nextBoolean();
313 final String synOut = getRandomString('0', 10, _TestUtil.nextInt(random, 1, 5)).trim();
315 add(synIn, synOut, s.keepOrig);
317 System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
321 tokensIn = new MockTokenizer(new StringReader("a"),
322 MockTokenizer.WHITESPACE,
// Prime the tokenizer once so it is in a reusable state:
325 assertTrue(tokensIn.incrementToken());
326 assertFalse(tokensIn.incrementToken());
330 tokensOut = new SynonymFilter(tokensIn,
333 termAtt = tokensOut.addAttribute(CharTermAttribute.class);
334 posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
335 offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
341 final String expected = slowSynMatcher(document, syns, 5);
344 System.out.println("TEST: expected=" + expected);
347 verify(document, expected);
// Removes duplicate output strings across all rules, mimicking the
// builder's dedup mode for the slow reference matcher.
// NOTE(review): the branch that removes/advances past a seen output is
// missing from this chunk.
350 private void pruneDups(List<OneSyn> syns) {
351 Set<String> seen = new HashSet<String>();
352 for(OneSyn syn : syns) {
354 while(idx < syn.out.size()) {
355 String out = syn.out.get(idx);
356 if (!seen.contains(out)) {
// Returns a random unicode string that is non-empty after trimming and
// contains no '\u0000' (which add() reserves as the word separator).
// NOTE(review): the surrounding retry loop / return lines are missing
// from this chunk.
367 private String randomNonEmptyString() {
369 final String s = _TestUtil.randomUnicodeString(random).trim();
370 if (s.length() != 0 && s.indexOf('\u0000') == -1) {
376 /** simple random test, doesn't verify correctness.
377 * does verify it doesnt throw exceptions, or that the stream doesn't misbehave
379 public void testRandom2() throws Exception {
380 final int numIters = atLeast(10);
381 for (int i = 0; i < numIters; i++) {
// Random map, random dedup/ignoreCase; checkRandomData exercises the
// stream contract (reset/incrementToken/end) without asserting output.
382 b = new SynonymMap.Builder(random.nextBoolean());
383 final int numEntries = atLeast(10);
384 for (int j = 0; j < numEntries; j++) {
385 add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
387 final SynonymMap map = b.build();
388 final boolean ignoreCase = random.nextBoolean();
390 final Analyzer analyzer = new ReusableAnalyzerBase() {
392 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
393 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
394 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
398 checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
// Verifies that replacement synonyms (=>, keepOrig=false) overwrite the
// positions of following input tokens ("pot", "of") rather than pushing
// them out of the stream, using the Solr synonym file format.
// NOTE(review): the String testFile declaration line is missing from
// this chunk.
403 public void testVanishingTerms() throws Exception {
405 "aaa => aaaa1 aaaa2 aaaa3\n" +
406 "bbb => bbbb1 bbbb2\n";
408 SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random));
409 parser.add(new StringReader(testFile));
410 final SynonymMap map = parser.build();
412 Analyzer analyzer = new ReusableAnalyzerBase() {
414 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
415 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
416 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
420 // where did my pot go?!
421 assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold",
422 new String[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" });
424 // this one nukes 'pot' and 'of'
425 // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold
426 assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold",
427 new String[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" });
// Same vanishing-terms scenario as testVanishingTerms but driven through
// verify(), checking both the stacked and keepOrig=false expectations.
// NOTE(review): the MockTokenizer/SynonymFilter constructor argument
// lines and the branch selecting between the two verify() pairs are
// missing from this chunk.
430 public void testBasic2() throws Exception {
431 b = new SynonymMap.Builder(true);
432 final boolean keepOrig = false;
433 add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
434 add("bbb", "bbbb1 bbbb2", keepOrig);
435 tokensIn = new MockTokenizer(new StringReader("a"),
436 MockTokenizer.WHITESPACE,
// Prime the tokenizer once so it is in a reusable state:
439 assertTrue(tokensIn.incrementToken());
440 assertFalse(tokensIn.incrementToken());
444 tokensOut = new SynonymFilter(tokensIn,
447 termAtt = tokensOut.addAttribute(CharTermAttribute.class);
448 posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
449 offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
452 verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
453 verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
455 verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
456 verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
// keepOrig=false matching: checks longest-match ("a b" beats "a"),
// fallback when a long rule fails partway ("z x c $" matches "x c"
// instead of "z x c v"), and non-matching tokens passing through.
460 public void testMatching() throws Exception {
461 b = new SynonymMap.Builder(true);
462 final boolean keepOrig = false;
463 add("a b", "ab", keepOrig);
464 add("a c", "ac", keepOrig);
465 add("a", "aa", keepOrig);
466 add("b", "bb", keepOrig);
467 add("z x c v", "zxcv", keepOrig);
468 add("x c", "xc", keepOrig);
469 final SynonymMap map = b.build();
470 Analyzer a = new ReusableAnalyzerBase() {
472 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
473 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
474 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
478 checkOneTerm(a, "$", "$");
479 checkOneTerm(a, "a", "aa");
480 checkOneTerm(a, "b", "bb");
482 assertAnalyzesTo(a, "a $",
483 new String[] { "aa", "$" },
486 assertAnalyzesTo(a, "$ a",
487 new String[] { "$", "aa" },
490 assertAnalyzesTo(a, "a a",
491 new String[] { "aa", "aa" },
494 assertAnalyzesTo(a, "z x c v",
495 new String[] { "zxcv" },
// Partial match of "z x c v" falls back to the shorter "x c" rule:
498 assertAnalyzesTo(a, "z x c $",
499 new String[] { "z", "xc", "$" },
500 new int[] { 1, 1, 1 });
// With dedup=true in the builder, identical repeated rules collapse to
// a single output token.
503 public void testRepeatsOff() throws Exception {
504 b = new SynonymMap.Builder(true);
505 final boolean keepOrig = false;
506 add("a b", "ab", keepOrig);
507 add("a b", "ab", keepOrig);
508 add("a b", "ab", keepOrig);
509 final SynonymMap map = b.build();
510 Analyzer a = new ReusableAnalyzerBase() {
512 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
513 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
514 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
518 assertAnalyzesTo(a, "a b",
519 new String[] { "ab" },
// With dedup=false, the same rule added three times emits three stacked
// "ab" tokens (posIncr 1, 0, 0) — the counterpart to testRepeatsOff.
523 public void testRepeatsOn() throws Exception {
524 b = new SynonymMap.Builder(false);
525 final boolean keepOrig = false;
526 add("a b", "ab", keepOrig);
527 add("a b", "ab", keepOrig);
528 add("a b", "ab", keepOrig);
529 final SynonymMap map = b.build();
530 Analyzer a = new ReusableAnalyzerBase() {
532 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
533 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
534 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
538 assertAnalyzesTo(a, "a b",
539 new String[] { "ab", "ab", "ab" },
540 new int[] { 1, 0, 0 });
// A rule mapping a token to itself ("zoo" -> "zoo") must not recurse:
// the output is not re-fed through the synonym map.
543 public void testRecursion() throws Exception {
544 b = new SynonymMap.Builder(true);
545 final boolean keepOrig = false;
546 add("zoo", "zoo", keepOrig);
547 final SynonymMap map = b.build();
548 Analyzer a = new ReusableAnalyzerBase() {
550 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
551 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
552 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
556 assertAnalyzesTo(a, "zoo zoo $ zoo",
557 new String[] { "zoo", "zoo", "$", "zoo" },
558 new int[] { 1, 1, 1, 1 });
// Self-mapping plus an expanding rule ("zoo" -> "zoo zoo"): both
// outputs are emitted and stacked, again without recursive rewriting.
561 public void testRecursion2() throws Exception {
562 b = new SynonymMap.Builder(true);
563 final boolean keepOrig = false;
564 add("zoo", "zoo", keepOrig);
565 add("zoo", "zoo zoo", keepOrig);
566 final SynonymMap map = b.build();
567 Analyzer a = new ReusableAnalyzerBase() {
569 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
570 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
571 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
575 // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo");
576 assertAnalyzesTo(a, "zoo zoo $ zoo",
577 new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
578 new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
// Same rule set as testMatching but with keepOrig=true: originals are
// emitted at posIncr 1 with the synonym stacked at posIncr 0.
581 public void testIncludeOrig() throws Exception {
582 b = new SynonymMap.Builder(true);
583 final boolean keepOrig = true;
584 add("a b", "ab", keepOrig);
585 add("a c", "ac", keepOrig);
586 add("a", "aa", keepOrig);
587 add("b", "bb", keepOrig);
588 add("z x c v", "zxcv", keepOrig);
589 add("x c", "xc", keepOrig);
590 final SynonymMap map = b.build();
591 Analyzer a = new ReusableAnalyzerBase() {
593 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
594 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
595 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
599 assertAnalyzesTo(a, "$",
600 new String[] { "$" },
602 assertAnalyzesTo(a, "a",
603 new String[] { "a", "aa" },
605 assertAnalyzesTo(a, "a",
606 new String[] { "a", "aa" },
608 assertAnalyzesTo(a, "$ a",
609 new String[] { "$", "a", "aa" },
610 new int[] { 1, 1, 0 });
611 assertAnalyzesTo(a, "a $",
612 new String[] { "a", "aa", "$" },
613 new int[] { 1, 0, 1 });
614 assertAnalyzesTo(a, "$ a !",
615 new String[] { "$", "a", "aa", "!" },
616 new int[] { 1, 1, 0, 1 });
617 assertAnalyzesTo(a, "a a",
618 new String[] { "a", "aa", "a", "aa" },
619 new int[] { 1, 0, 1, 0 });
620 assertAnalyzesTo(a, "b",
621 new String[] { "b", "bb" },
// Multi-token match with keepOrig: synonym stacked on the first
// original token, remaining originals follow at their own positions:
623 assertAnalyzesTo(a, "z x c v",
624 new String[] { "z", "zxcv", "x", "c", "v" },
625 new int[] { 1, 0, 1, 1, 1 });
626 assertAnalyzesTo(a, "z x c $",
627 new String[] { "z", "x", "xc", "c", "$" },
628 new int[] { 1, 1, 0, 1, 1 });
// Contracting self-referential rule ("zoo zoo" -> "zoo") with keepOrig:
// the shorter synonym stacks over the first original token.
631 public void testRecursion3() throws Exception {
632 b = new SynonymMap.Builder(true);
633 final boolean keepOrig = true;
634 add("zoo zoo", "zoo", keepOrig);
635 final SynonymMap map = b.build();
636 Analyzer a = new ReusableAnalyzerBase() {
638 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
639 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
640 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
644 assertAnalyzesTo(a, "zoo zoo $ zoo",
645 new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
646 new int[] { 1, 0, 1, 1, 1 });
// Both contracting ("zoo zoo" -> "zoo") and expanding ("zoo" ->
// "zoo zoo") self-referential rules together; greedy matching consumes
// "zoo zoo" first, then the single trailing "zoo" expands.
// NOTE(review): the method's and class's closing braces fall outside
// this chunk.
649 public void testRecursion4() throws Exception {
650 b = new SynonymMap.Builder(true);
651 final boolean keepOrig = true;
652 add("zoo zoo", "zoo", keepOrig);
653 add("zoo", "zoo zoo", keepOrig);
654 final SynonymMap map = b.build();
655 Analyzer a = new ReusableAnalyzerBase() {
657 protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
658 Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
659 return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
663 assertAnalyzesTo(a, "zoo zoo $ zoo",
664 new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
665 new int[] { 1, 0, 1, 1, 1, 0, 1 });