1 package org.apache.lucene.analysis.shingle;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.Reader;
21 import java.io.StringReader;
23 import org.apache.lucene.analysis.Analyzer;
24 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
25 import org.apache.lucene.analysis.MockAnalyzer;
26 import org.apache.lucene.analysis.MockTokenizer;
27 import org.apache.lucene.analysis.TokenStream;
28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
29 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
30 import org.apache.lucene.document.Document;
31 import org.apache.lucene.document.Field;
32 import org.apache.lucene.index.IndexWriter;
33 import org.apache.lucene.index.IndexWriterConfig;
34 import org.apache.lucene.index.Term;
35 import org.apache.lucene.queryParser.QueryParser;
36 import org.apache.lucene.search.BooleanClause;
37 import org.apache.lucene.search.BooleanQuery;
38 import org.apache.lucene.search.IndexSearcher;
39 import org.apache.lucene.search.PhraseQuery;
40 import org.apache.lucene.search.Query;
41 import org.apache.lucene.search.ScoreDoc;
42 import org.apache.lucene.search.TermQuery;
43 import org.apache.lucene.store.Directory;
44 import org.apache.lucene.store.RAMDirectory;
/**
 * A test class for ShingleAnalyzerWrapper as regards queries and scoring.
 */
49 public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
  // Searcher over the index built by setUpSearcher(Analyzer); shared by the query tests.
  public IndexSearcher searcher;
  /**
   * Set up a new index in RAM with three test phrases and the supplied Analyzer.
   *
   * @param analyzer the analyzer to use
   * @return an indexSearcher on the test index.
   * @throws Exception if an error occurs with index writer or searcher
   */
60 public IndexSearcher setUpSearcher(Analyzer analyzer) throws Exception {
61 Directory dir = new RAMDirectory();
62 IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
66 doc.add(new Field("content", "please divide this sentence into shingles",
67 Field.Store.YES,Field.Index.ANALYZED));
68 writer.addDocument(doc);
71 doc.add(new Field("content", "just another test sentence",
72 Field.Store.YES,Field.Index.ANALYZED));
73 writer.addDocument(doc);
76 doc.add(new Field("content", "a sentence which contains no test",
77 Field.Store.YES,Field.Index.ANALYZED));
78 writer.addDocument(doc);
82 return new IndexSearcher(dir, true);
85 protected ScoreDoc[] queryParsingTest(Analyzer analyzer, String qs) throws Exception {
86 searcher = setUpSearcher(analyzer);
88 QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "content", analyzer);
90 Query q = qp.parse(qs);
92 return searcher.search(q, null, 1000).scoreDocs;
95 protected void compareRanks(ScoreDoc[] hits, int[] ranks) throws Exception {
96 assertEquals(ranks.length, hits.length);
97 for (int i = 0; i < ranks.length; i++) {
98 assertEquals(ranks[i], hits[i].doc);
  /**
   * Will not work on an index without unigrams, since QueryParser automatically
   * tokenizes on whitespace.
   */
106 public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
107 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
108 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
110 int[] ranks = new int[] { 1, 2, 0 };
111 compareRanks(hits, ranks);
  /**
   * This one fails with an exception.
   */
117 public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
118 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
119 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
120 "\"this sentence\"");
121 int[] ranks = new int[] { 0 };
122 compareRanks(hits, ranks);
  /**
   * This one works, actually.
   */
128 public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
129 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
130 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
131 "\"test sentence\"");
132 int[] ranks = new int[] { 1 };
133 compareRanks(hits, ranks);
  /**
   * Same as above, is tokenized without using the analyzer.
   */
139 public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
140 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
141 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
143 int[] ranks = new int[] { 1, 2 };
144 compareRanks(hits, ranks);
  /**
   * This shows how to construct a phrase query containing shingles.
   */
150 public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
151 Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
152 searcher = setUpSearcher(analyzer);
154 PhraseQuery q = new PhraseQuery();
156 TokenStream ts = analyzer.tokenStream("content",
157 new StringReader("this sentence"));
160 PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
161 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
164 while (ts.incrementToken()) {
165 j += posIncrAtt.getPositionIncrement();
166 String termText = termAtt.toString();
167 q.add(new Term("content", termText), j);
170 ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
171 int[] ranks = new int[] { 0 };
172 compareRanks(hits, ranks);
  /**
   * How to construct a boolean query with shingles. A query like this will
   * implicitly score those documents higher that contain the words in the query
   * in the right order and adjacent to each other.
   */
180 public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
181 Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
182 searcher = setUpSearcher(analyzer);
184 BooleanQuery q = new BooleanQuery();
186 TokenStream ts = analyzer.tokenStream("content",
187 new StringReader("test sentence"));
189 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
193 while (ts.incrementToken()) {
194 String termText = termAtt.toString();
195 q.add(new TermQuery(new Term("content", termText)),
196 BooleanClause.Occur.SHOULD);
199 ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
200 int[] ranks = new int[] { 1, 2, 0 };
201 compareRanks(hits, ranks);
204 public void testReusableTokenStream() throws Exception {
205 Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
206 assertAnalyzesToReuse(a, "please divide into shingles",
207 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
208 new int[] { 0, 0, 7, 7, 14, 14, 19 },
209 new int[] { 6, 13, 13, 18, 18, 27, 27 },
210 new int[] { 1, 0, 1, 0, 1, 0, 1 });
211 assertAnalyzesToReuse(a, "divide me up again",
212 new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
213 new int[] { 0, 0, 7, 7, 10, 10, 13 },
214 new int[] { 6, 9, 9, 12, 12, 18, 18 },
215 new int[] { 1, 0, 1, 0, 1, 0, 1 });
  /**
   * An analyzer that does not support reuse:
   * it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
   */
222 private class NonreusableAnalyzer extends Analyzer {
223 int invocationCount = 0;
225 public TokenStream tokenStream(String fieldName, Reader reader) {
226 if (++invocationCount % 2 == 0)
227 return new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
229 return new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
233 public void testWrappedAnalyzerDoesNotReuse() throws Exception {
234 Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
235 assertAnalyzesToReuse(a, "please divide into shingles.",
236 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
237 new int[] { 0, 0, 7, 7, 14, 14, 19 },
238 new int[] { 6, 13, 13, 18, 18, 27, 27 },
239 new int[] { 1, 0, 1, 0, 1, 0, 1 });
240 assertAnalyzesToReuse(a, "please divide into shingles.",
241 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles.", "shingles." },
242 new int[] { 0, 0, 7, 7, 14, 14, 19 },
243 new int[] { 6, 13, 13, 18, 18, 28, 28 },
244 new int[] { 1, 0, 1, 0, 1, 0, 1 });
245 assertAnalyzesToReuse(a, "please divide into shingles.",
246 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
247 new int[] { 0, 0, 7, 7, 14, 14, 19 },
248 new int[] { 6, 13, 13, 18, 18, 27, 27 },
249 new int[] { 1, 0, 1, 0, 1, 0, 1 });
252 public void testNonDefaultMinShingleSize() throws Exception {
253 ShingleAnalyzerWrapper analyzer
254 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4);
255 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
256 new String[] { "please", "please divide this", "please divide this sentence",
257 "divide", "divide this sentence", "divide this sentence into",
258 "this", "this sentence into", "this sentence into shingles",
259 "sentence", "sentence into shingles",
262 new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
263 new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
264 new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
265 analyzer.setOutputUnigrams(false);
266 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
267 new String[] { "please divide this", "please divide this sentence",
268 "divide this sentence", "divide this sentence into",
269 "this sentence into", "this sentence into shingles",
270 "sentence into shingles" },
271 new int[] { 0, 0, 7, 7, 14, 14, 19 },
272 new int[] { 18, 27, 27, 32, 32, 41, 41 },
273 new int[] { 1, 0, 1, 0, 1, 0, 1 });
276 public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
277 ShingleAnalyzerWrapper analyzer
278 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3);
279 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
280 new String[] { "please", "please divide this",
281 "divide", "divide this sentence",
282 "this", "this sentence into",
283 "sentence", "sentence into shingles",
286 new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
287 new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
288 new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
289 analyzer.setOutputUnigrams(false);
290 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
291 new String[] { "please divide this",
292 "divide this sentence",
293 "this sentence into",
294 "sentence into shingles" },
295 new int[] { 0, 7, 14, 19 },
296 new int[] { 18, 27, 32, 41 },
297 new int[] { 1, 1, 1, 1 });
300 public void testNoTokenSeparator() throws Exception {
301 ShingleAnalyzerWrapper analyzer
302 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
303 analyzer.setTokenSeparator("");
304 assertAnalyzesToReuse(analyzer, "please divide into shingles",
305 new String[] { "please", "pleasedivide",
306 "divide", "divideinto",
307 "into", "intoshingles",
309 new int[] { 0, 0, 7, 7, 14, 14, 19 },
310 new int[] { 6, 13, 13, 18, 18, 27, 27 },
311 new int[] { 1, 0, 1, 0, 1, 0, 1 });
312 analyzer.setOutputUnigrams(false);
313 assertAnalyzesToReuse(analyzer, "please divide into shingles",
314 new String[] { "pleasedivide",
317 new int[] { 0, 7, 14 },
318 new int[] { 13, 18, 27 },
319 new int[] { 1, 1, 1 });
322 public void testNullTokenSeparator() throws Exception {
323 ShingleAnalyzerWrapper analyzer
324 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
325 analyzer.setTokenSeparator(null);
326 assertAnalyzesToReuse(analyzer, "please divide into shingles",
327 new String[] { "please", "pleasedivide",
328 "divide", "divideinto",
329 "into", "intoshingles",
331 new int[] { 0, 0, 7, 7, 14, 14, 19 },
332 new int[] { 6, 13, 13, 18, 18, 27, 27 },
333 new int[] { 1, 0, 1, 0, 1, 0, 1 });
334 analyzer.setOutputUnigrams(false);
335 assertAnalyzesToReuse(analyzer, "please divide into shingles",
336 new String[] { "pleasedivide",
339 new int[] { 0, 7, 14 },
340 new int[] { 13, 18, 27 },
341 new int[] { 1, 1, 1 });
343 public void testAltTokenSeparator() throws Exception {
344 ShingleAnalyzerWrapper analyzer
345 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
346 analyzer.setTokenSeparator("<SEP>");
347 assertAnalyzesToReuse(analyzer, "please divide into shingles",
348 new String[] { "please", "please<SEP>divide",
349 "divide", "divide<SEP>into",
350 "into", "into<SEP>shingles",
352 new int[] { 0, 0, 7, 7, 14, 14, 19 },
353 new int[] { 6, 13, 13, 18, 18, 27, 27 },
354 new int[] { 1, 0, 1, 0, 1, 0, 1 });
355 analyzer.setOutputUnigrams(false);
356 assertAnalyzesToReuse(analyzer, "please divide into shingles",
357 new String[] { "please<SEP>divide",
359 "into<SEP>shingles" },
360 new int[] { 0, 7, 14 },
361 new int[] { 13, 18, 27 },
362 new int[] { 1, 1, 1 });
365 public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
366 ShingleAnalyzerWrapper analyzer
367 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
368 analyzer.setOutputUnigrams(false);
369 analyzer.setOutputUnigramsIfNoShingles(true);
370 assertAnalyzesToReuse(analyzer, "please",
371 new String[] { "please" },