1 package org.apache.lucene.analysis.shingle;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.Reader;
21 import java.io.StringReader;
23 import org.apache.lucene.analysis.Analyzer;
24 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
25 import org.apache.lucene.analysis.MockAnalyzer;
26 import org.apache.lucene.analysis.MockTokenizer;
27 import org.apache.lucene.analysis.TokenStream;
28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
29 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
30 import org.apache.lucene.document.Document;
31 import org.apache.lucene.document.Field;
32 import org.apache.lucene.index.IndexWriter;
33 import org.apache.lucene.index.IndexWriterConfig;
34 import org.apache.lucene.index.Term;
35 import org.apache.lucene.queryParser.QueryParser;
36 import org.apache.lucene.search.BooleanClause;
37 import org.apache.lucene.search.BooleanQuery;
38 import org.apache.lucene.search.IndexSearcher;
39 import org.apache.lucene.search.PhraseQuery;
40 import org.apache.lucene.search.Query;
41 import org.apache.lucene.search.ScoreDoc;
42 import org.apache.lucene.search.TermQuery;
43 import org.apache.lucene.store.Directory;
44 import org.apache.lucene.store.RAMDirectory;
/**
 * A test class for ShingleAnalyzerWrapper as regards queries and scoring.
 */
49 public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
  // Searcher over the index built by setUpSearcher(Analyzer); shared by the query tests.
  public IndexSearcher searcher;
  /**
   * Set up a new index in RAM with three test phrases and the supplied Analyzer.
   *
   * @param analyzer the analyzer to use
   * @return an indexSearcher on the test index.
   * @throws Exception if an error occurs with index writer or searcher
   */
60 public IndexSearcher setUpSearcher(Analyzer analyzer) throws Exception {
61 Directory dir = new RAMDirectory();
62 IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
66 doc.add(new Field("content", "please divide this sentence into shingles",
67 Field.Store.YES,Field.Index.ANALYZED));
68 writer.addDocument(doc);
71 doc.add(new Field("content", "just another test sentence",
72 Field.Store.YES,Field.Index.ANALYZED));
73 writer.addDocument(doc);
76 doc.add(new Field("content", "a sentence which contains no test",
77 Field.Store.YES,Field.Index.ANALYZED));
78 writer.addDocument(doc);
82 return new IndexSearcher(dir, true);
85 protected ScoreDoc[] queryParsingTest(Analyzer analyzer, String qs) throws Exception {
86 searcher = setUpSearcher(analyzer);
88 QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "content", analyzer);
90 Query q = qp.parse(qs);
92 return searcher.search(q, null, 1000).scoreDocs;
95 protected void compareRanks(ScoreDoc[] hits, int[] ranks) throws Exception {
96 assertEquals(ranks.length, hits.length);
97 for (int i = 0; i < ranks.length; i++) {
98 assertEquals(ranks[i], hits[i].doc);
  /**
   * Will not work on an index without unigrams, since QueryParser automatically
   * tokenizes on whitespace.
   */
106 public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
107 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
108 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
110 int[] ranks = new int[] { 1, 2, 0 };
111 compareRanks(hits, ranks);
  /**
   * This one fails with an exception.
   */
117 public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
118 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
119 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
120 "\"this sentence\"");
121 int[] ranks = new int[] { 0 };
122 compareRanks(hits, ranks);
  /**
   * This one works, actually.
   */
128 public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
129 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
130 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
131 "\"test sentence\"");
132 int[] ranks = new int[] { 1 };
133 compareRanks(hits, ranks);
  /**
   * Same as above, is tokenized without using the analyzer.
   */
139 public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
140 ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
141 (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2),
143 int[] ranks = new int[] { 1, 2 };
144 compareRanks(hits, ranks);
  /**
   * This shows how to construct a phrase query containing shingles.
   */
150 public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
151 Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
152 searcher = setUpSearcher(analyzer);
154 PhraseQuery q = new PhraseQuery();
156 TokenStream ts = analyzer.tokenStream("content",
157 new StringReader("this sentence"));
160 PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
161 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
164 while (ts.incrementToken()) {
165 j += posIncrAtt.getPositionIncrement();
166 String termText = termAtt.toString();
167 q.add(new Term("content", termText), j);
170 ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
171 int[] ranks = new int[] { 0 };
172 compareRanks(hits, ranks);
  /**
   * How to construct a boolean query with shingles. A query like this will
   * implicitly score those documents higher that contain the words in the query
   * in the right order and adjacent to each other.
   */
180 public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
181 Analyzer analyzer = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
182 searcher = setUpSearcher(analyzer);
184 BooleanQuery q = new BooleanQuery();
186 TokenStream ts = analyzer.tokenStream("content",
187 new StringReader("test sentence"));
189 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
193 while (ts.incrementToken()) {
194 String termText = termAtt.toString();
195 q.add(new TermQuery(new Term("content", termText)),
196 BooleanClause.Occur.SHOULD);
199 ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
200 int[] ranks = new int[] { 1, 2, 0 };
201 compareRanks(hits, ranks);
204 public void testReusableTokenStream() throws Exception {
205 Analyzer a = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2);
206 assertAnalyzesToReuse(a, "please divide into shingles",
207 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
208 new int[] { 0, 0, 7, 7, 14, 14, 19 },
209 new int[] { 6, 13, 13, 18, 18, 27, 27 },
210 new int[] { 1, 0, 1, 0, 1, 0, 1 });
211 assertAnalyzesToReuse(a, "divide me up again",
212 new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
213 new int[] { 0, 0, 7, 7, 10, 10, 13 },
214 new int[] { 6, 9, 9, 12, 12, 18, 18 },
215 new int[] { 1, 0, 1, 0, 1, 0, 1 });
  /**
   * An analyzer that does not support reuse:
   * it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
   */
222 private class NonreusableAnalyzer extends Analyzer {
223 int invocationCount = 0;
225 public TokenStream tokenStream(String fieldName, Reader reader) {
226 if (++invocationCount % 2 == 0)
227 return new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
229 return new MockTokenizer(reader, MockTokenizer.SIMPLE, false);
233 public void testWrappedAnalyzerDoesNotReuse() throws Exception {
234 Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
235 assertAnalyzesToReuse(a, "please divide into shingles.",
236 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
237 new int[] { 0, 0, 7, 7, 14, 14, 19 },
238 new int[] { 6, 13, 13, 18, 18, 27, 27 },
239 new int[] { 1, 0, 1, 0, 1, 0, 1 });
240 assertAnalyzesToReuse(a, "please divide into shingles.",
241 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles.", "shingles." },
242 new int[] { 0, 0, 7, 7, 14, 14, 19 },
243 new int[] { 6, 13, 13, 18, 18, 28, 28 },
244 new int[] { 1, 0, 1, 0, 1, 0, 1 });
245 assertAnalyzesToReuse(a, "please divide into shingles.",
246 new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
247 new int[] { 0, 0, 7, 7, 14, 14, 19 },
248 new int[] { 6, 13, 13, 18, 18, 27, 27 },
249 new int[] { 1, 0, 1, 0, 1, 0, 1 });
252 public void testNonDefaultMinShingleSize() throws Exception {
253 ShingleAnalyzerWrapper analyzer
254 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4);
255 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
256 new String[] { "please", "please divide this", "please divide this sentence",
257 "divide", "divide this sentence", "divide this sentence into",
258 "this", "this sentence into", "this sentence into shingles",
259 "sentence", "sentence into shingles",
262 new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
263 new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
264 new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
265 analyzer.setOutputUnigrams(false);
266 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
267 new String[] { "please divide this", "please divide this sentence",
268 "divide this sentence", "divide this sentence into",
269 "this sentence into", "this sentence into shingles",
270 "sentence into shingles" },
271 new int[] { 0, 0, 7, 7, 14, 14, 19 },
272 new int[] { 18, 27, 27, 32, 32, 41, 41 },
273 new int[] { 1, 0, 1, 0, 1, 0, 1 });
276 public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
277 ShingleAnalyzerWrapper analyzer
278 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3);
279 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
280 new String[] { "please", "please divide this",
281 "divide", "divide this sentence",
282 "this", "this sentence into",
283 "sentence", "sentence into shingles",
286 new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
287 new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
288 new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
289 analyzer.setOutputUnigrams(false);
290 assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
291 new String[] { "please divide this",
292 "divide this sentence",
293 "this sentence into",
294 "sentence into shingles" },
295 new int[] { 0, 7, 14, 19 },
296 new int[] { 18, 27, 32, 41 },
297 new int[] { 1, 1, 1, 1 });
300 public void testNoTokenSeparator() throws Exception {
301 ShingleAnalyzerWrapper analyzer
302 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
303 analyzer.setTokenSeparator("");
304 assertAnalyzesToReuse(analyzer, "please divide into shingles",
305 new String[] { "please", "pleasedivide",
306 "divide", "divideinto",
307 "into", "intoshingles",
309 new int[] { 0, 0, 7, 7, 14, 14, 19 },
310 new int[] { 6, 13, 13, 18, 18, 27, 27 },
311 new int[] { 1, 0, 1, 0, 1, 0, 1 });
312 analyzer.setOutputUnigrams(false);
313 assertAnalyzesToReuse(analyzer, "please divide into shingles",
314 new String[] { "pleasedivide",
317 new int[] { 0, 7, 14 },
318 new int[] { 13, 18, 27 },
319 new int[] { 1, 1, 1 });
322 public void testNullTokenSeparator() throws Exception {
323 ShingleAnalyzerWrapper analyzer
324 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
325 analyzer.setTokenSeparator(null);
326 assertAnalyzesToReuse(analyzer, "please divide into shingles",
327 new String[] { "please", "pleasedivide",
328 "divide", "divideinto",
329 "into", "intoshingles",
331 new int[] { 0, 0, 7, 7, 14, 14, 19 },
332 new int[] { 6, 13, 13, 18, 18, 27, 27 },
333 new int[] { 1, 0, 1, 0, 1, 0, 1 });
334 analyzer.setOutputUnigrams(false);
335 assertAnalyzesToReuse(analyzer, "please divide into shingles",
336 new String[] { "pleasedivide",
339 new int[] { 0, 7, 14 },
340 new int[] { 13, 18, 27 },
341 new int[] { 1, 1, 1 });
343 public void testAltTokenSeparator() throws Exception {
344 ShingleAnalyzerWrapper analyzer
345 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
346 analyzer.setTokenSeparator("<SEP>");
347 assertAnalyzesToReuse(analyzer, "please divide into shingles",
348 new String[] { "please", "please<SEP>divide",
349 "divide", "divide<SEP>into",
350 "into", "into<SEP>shingles",
352 new int[] { 0, 0, 7, 7, 14, 14, 19 },
353 new int[] { 6, 13, 13, 18, 18, 27, 27 },
354 new int[] { 1, 0, 1, 0, 1, 0, 1 });
355 analyzer.setOutputUnigrams(false);
356 assertAnalyzesToReuse(analyzer, "please divide into shingles",
357 new String[] { "please<SEP>divide",
359 "into<SEP>shingles" },
360 new int[] { 0, 7, 14 },
361 new int[] { 13, 18, 27 },
362 new int[] { 1, 1, 1 });
365 public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
366 ShingleAnalyzerWrapper analyzer
367 = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
368 analyzer.setOutputUnigrams(false);
369 analyzer.setOutputUnigramsIfNoShingles(true);
370 assertAnalyzesToReuse(analyzer, "please",
371 new String[] { "please" },