lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestClassicAnalyzer.java

   1 package org.apache.lucene.analysis;
   2
   3 import org.apache.lucene.analysis.standard.ClassicAnalyzer;
   4
   5 import org.apache.lucene.document.Document;
   6 import org.apache.lucene.document.Field;
   7 import org.apache.lucene.index.IndexReader;
   8 import org.apache.lucene.index.IndexWriter;
   9 import org.apache.lucene.index.IndexWriterConfig;
  10 import org.apache.lucene.index.Term;
  11 import org.apache.lucene.index.TermPositions;
  12 import org.apache.lucene.store.RAMDirectory;
  13 import org.apache.lucene.util.Version;
  14
  15 import java.io.IOException;
  16 import java.util.Arrays;
  17
  18
  19 /**
  20  * Copyright 2004 The Apache Software Foundation
  21  * <p/>
  22  * Licensed under the Apache License, Version 2.0 (the "License");
  23  * you may not use this file except in compliance with the License.
  24  * You may obtain a copy of the License at
  25  * <p/>
  26  * http://www.apache.org/licenses/LICENSE-2.0
  27  * <p/>
  28  * Unless required by applicable law or agreed to in writing, software
  29  * distributed under the License is distributed on an "AS IS" BASIS,
  30  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  31  * See the License for the specific language governing permissions and
  32  * limitations under the License.
  33  */
  34
  35 public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
  36
  37   private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);
  38
  39   public void testMaxTermLength() throws Exception {
  40     ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
  41     sa.setMaxTokenLength(5);
  42     assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
  43   }
  44
  45   public void testMaxTermLength2() throws Exception {
  46     ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
  47     assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
  48     sa.setMaxTokenLength(5);
  49
  50     assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
  51   }
  52
  53   public void testMaxTermLength3() throws Exception {
  54     char[] chars = new char[255];
  55     for(int i=0;i<255;i++)
  56       chars[i] = 'a';
  57     String longTerm = new String(chars, 0, 255);
  58
  59     assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
  60     assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
  61   }
  62
  63   public void testAlphanumeric() throws Exception {
  64     // alphanumeric tokens
  65     assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
  66     assertAnalyzesTo(a, "2B", new String[]{"2b"});
  67   }
  68
  69   public void testUnderscores() throws Exception {
  70     // underscores are delimiters, but not in email addresses (below)
  71     assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
  72     assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
  73   }
  74
  75   public void testDelimiters() throws Exception {
  76     // other delimiters: "-", "/", ","
  77     assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
  78     assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
  79     assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
  80   }
  81
  82   public void testApostrophes() throws Exception {
  83     // internal apostrophes: O'Reilly, you're, O'Reilly's
  84     // possessives are actually removed by StardardFilter, not the tokenizer
  85     assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
  86     assertAnalyzesTo(a, "you're", new String[]{"you're"});
  87     assertAnalyzesTo(a, "she's", new String[]{"she"});
  88     assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
  89     assertAnalyzesTo(a, "don't", new String[]{"don't"});
  90     assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
  91   }
  92
  93   public void testTSADash() throws Exception {
  94     // t and s had been stopwords in Lucene <= 2.0, which made it impossible
  95     // to correctly search for these terms:
  96     assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
  97     assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
  98     // 'a' is still a stopword:
  99     assertAnalyzesTo(a, "a-class", new String[]{"class"});
 100   }
 101
 102   public void testCompanyNames() throws Exception {
 103     // company names
 104     assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
 105     assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
 106   }
 107
 108   public void testLucene1140() throws Exception {
 109     try {
 110       ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
 111       assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
 112     } catch (NullPointerException e) {
 113       fail("Should not throw an NPE and it did");
 114     }
 115
 116   }
 117
 118   public void testDomainNames() throws Exception {
 119     // Current lucene should not show the bug
 120     ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);
 121
 122     // domain names
 123     assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
 124     //Notice the trailing .  See https://issues.apache.org/jira/browse/LUCENE-1068.
 125     // the following should be recognized as HOST:
 126     assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
 127
 128     // 2.3 should show the bug
 129     a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
 130     assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
 131
 132     // 2.4 should not show the bug
 133     a2 = new ClassicAnalyzer(Version.LUCENE_24);
 134     assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
 135   }
 136
 137   public void testEMailAddresses() throws Exception {
 138     // email addresses, possibly with underscores, periods, etc
 139     assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
 140     assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
 141     assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
 142   }
 143
 144   public void testNumeric() throws Exception {
 145     // floating point, serial, model numbers, ip addresses, etc.
 146     // every other segment must have at least one digit
 147     assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
 148     assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
 149     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
 150     assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
 151     assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
 152     assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
 153   }
 154
 155   public void testTextWithNumbers() throws Exception {
 156     // numbers
 157     assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
 158   }
 159
 160   public void testVariousText() throws Exception {
 161     // various
 162     assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
 163     assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
 164     assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
 165     assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
 166   }
 167
 168   public void testAcronyms() throws Exception {
 169     // acronyms have their dots stripped
 170     assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
 171   }
 172
 173   public void testCPlusPlusHash() throws Exception {
 174     // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
 175     assertAnalyzesTo(a, "C++", new String[]{"c"});
 176     assertAnalyzesTo(a, "C#", new String[]{"c"});
 177   }
 178
 179   public void testKorean() throws Exception {
 180     // Korean words
 181     assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
 182   }
 183
 184   // Compliance with the "old" JavaCC-based analyzer, see:
 185   // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
 186
 187   public void testComplianceFileName() throws Exception {
 188     assertAnalyzesTo(a, "2004.jpg",
 189             new String[]{"2004.jpg"},
 190             new String[]{"<HOST>"});
 191   }
 192
 193   public void testComplianceNumericIncorrect() throws Exception {
 194     assertAnalyzesTo(a, "62.46",
 195             new String[]{"62.46"},
 196             new String[]{"<HOST>"});
 197   }
 198
 199   public void testComplianceNumericLong() throws Exception {
 200     assertAnalyzesTo(a, "978-0-94045043-1",
 201             new String[]{"978-0-94045043-1"},
 202             new String[]{"<NUM>"});
 203   }
 204
 205   public void testComplianceNumericFile() throws Exception {
 206     assertAnalyzesTo(
 207             a,
 208             "78academyawards/rules/rule02.html",
 209             new String[]{"78academyawards/rules/rule02.html"},
 210             new String[]{"<NUM>"});
 211   }
 212
 213   public void testComplianceNumericWithUnderscores() throws Exception {
 214     assertAnalyzesTo(
 215             a,
 216             "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
 217             new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
 218             new String[]{"<NUM>"});
 219   }
 220
 221   public void testComplianceNumericWithDash() throws Exception {
 222     assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
 223             new String[]{"<NUM>"});
 224   }
 225
 226   public void testComplianceManyTokens() throws Exception {
 227     assertAnalyzesTo(
 228             a,
 229             "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
 230                     + "safari-0-sheikh-zayed-grand-mosque.jpg",
 231             new String[]{"money.cnn.com", "magazines", "fortune",
 232                     "fortune", "archive/2007/03/19/8402357", "index.htm",
 233                     "safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
 234             new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
 235                     "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
 236                     "<ALPHANUM>", "<HOST>"});
 237   }
 238
 239   public void testJava14BWCompatibility() throws Exception {
 240     ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
 241     assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
 242   }
 243
 244   /**
 245    * Make sure we skip wicked long terms.
 246    */
 247   public void testWickedLongTerm() throws IOException {
 248     RAMDirectory dir = new RAMDirectory();
 249     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
 250       TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));
 251
 252     char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
 253     Arrays.fill(chars, 'x');
 254     Document doc = new Document();
 255     final String bigTerm = new String(chars);
 256
 257     // This produces a too-long term:
 258     String contents = "abc xyz x" + bigTerm + " another term";
 259     doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
 260     writer.addDocument(doc);
 261
 262     // Make sure we can add another normal document
 263     doc = new Document();
 264     doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
 265     writer.addDocument(doc);
 266     writer.close();
 267
 268     IndexReader reader = IndexReader.open(dir, true);
 269
 270     // Make sure all terms < max size were indexed
 271     assertEquals(2, reader.docFreq(new Term("content", "abc")));
 272     assertEquals(1, reader.docFreq(new Term("content", "bbb")));
 273     assertEquals(1, reader.docFreq(new Term("content", "term")));
 274     assertEquals(1, reader.docFreq(new Term("content", "another")));
 275
 276     // Make sure position is still incremented when
 277     // massive term is skipped:
 278     TermPositions tps = reader.termPositions(new Term("content", "another"));
 279     assertTrue(tps.next());
 280     assertEquals(1, tps.freq());
 281     assertEquals(3, tps.nextPosition());
 282
 283     // Make sure the doc that has the massive term is in
 284     // the index:
 285     assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
 286
 287     reader.close();
 288
 289     // Make sure we can add a document with exactly the
 290     // maximum length term, and search on that term:
 291     doc = new Document();
 292     doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
 293     ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
 294     sa.setMaxTokenLength(100000);
 295     writer  = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
 296     writer.addDocument(doc);
 297     writer.close();
 298     reader = IndexReader.open(dir, true);
 299     assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
 300     reader.close();
 301
 302     dir.close();
 303   }
 304
 305   /** blast some random strings through the analyzer */
 306   public void testRandomStrings() throws Exception {
 307     checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
 308   }
 309 }