X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestClassicAnalyzer.java diff --git a/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestClassicAnalyzer.java b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestClassicAnalyzer.java new file mode 100644 index 0000000..1987e46 --- /dev/null +++ b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/analysis/TestClassicAnalyzer.java @@ -0,0 +1,309 @@ +package org.apache.lucene.analysis; + +import org.apache.lucene.analysis.standard.ClassicAnalyzer; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; + +import java.io.IOException; +import java.util.Arrays; + + +/** + * Copyright 2004 The Apache Software Foundation + *
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
+
+  private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);
+
+  public void testMaxTermLength() throws Exception {
+    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+
+    // "toolong" is dropped, but it still bumps the position increment of "xy":
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for (int i = 0; i < 255; i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testAlphanumeric() throws Exception {
+    // alphanumeric tokens
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
+  }
+
+  public void testUnderscores() throws Exception {
+    // underscores are delimiters, but not in email addresses (below)
+    assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
+    assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
+  }
+
+  public void testDelimiters() throws Exception {
+    // other delimiters: "-", "/", ","
+    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+  }
+
+  public void testApostrophes() throws Exception {
+    // internal apostrophes: O'Reilly, you're, O'Reilly's
+    // possessives are actually removed by StandardFilter, not the tokenizer
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
+  }
+
+  public void testTSADash() throws Exception {
+    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
+    // to correctly search for these terms:
+    assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
+    assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
+    // 'a' is still a stopword:
+    assertAnalyzesTo(a, "a-class", new String[]{"class"});
+  }
+
+  public void testCompanyNames() throws Exception {
+    // company names
+    assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
+    assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
+  }
+
+  public void testLucene1140() throws Exception {
+    try {
+      ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
+      assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
+    } catch (NullPointerException e) {
+      fail("Should not have thrown an NPE, but it did");
+    }
+
+  }
+
+  public void testDomainNames() throws Exception {
+    // Current Lucene should not show the bug
+    ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);
+
+    // domain names
+    assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
+    // Notice the trailing dot. See https://issues.apache.org/jira/browse/LUCENE-1068.
+    // the following should be recognized as HOST:
+    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
+
+    // 2.3 should show the bug
+    a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
+    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
+
+    // 2.4 should not show the bug
+    a2 = new ClassicAnalyzer(Version.LUCENE_24);
+    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
+  }
+
+  public void testEMailAddresses() throws Exception {
+    // email addresses, possibly with underscores, periods, etc
+    assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
+    assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
+    assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
+  }
+
+  public void testNumeric() throws Exception {
+    // floating point, serial, model numbers, ip addresses, etc.
+    // every other segment must have at least one digit
+    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+    assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
+    assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
+    assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
+  }
+
+  public void testTextWithNumbers() throws Exception {
+    // numbers
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+  }
+
+  public void testVariousText() throws Exception {
+    // various
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+  }
+
+  public void testAcronyms() throws Exception {
+    // acronyms have their dots stripped
+    assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
+  }
+
+  public void testCPlusPlusHash() throws Exception {
+    // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
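+    // For now, '+' and '#' are treated as delimiters, so each input reduces to the single token "c"
+    // (lowercased by the analyzer):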
+    assertAnalyzesTo(a, "C++", new String[]{"c"});
+    assertAnalyzesTo(a, "C#", new String[]{"c"});
+  }
+
+  public void testKorean() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+  }
+
+  // Compliance with the "old" JavaCC-based analyzer, see:
+  // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
+
+  public void testComplianceFileName() throws Exception {
+    assertAnalyzesTo(a, "2004.jpg",
+        new String[]{"2004.jpg"},
+        new String[]{"<HOST>"});
+  }
+
+  public void testComplianceNumericIncorrect() throws Exception {
+    assertAnalyzesTo(a, "62.46",
+        new String[]{"62.46"},
+        new String[]{"<HOST>"});
+  }
+
+  public void testComplianceNumericLong() throws Exception {
+    assertAnalyzesTo(a, "978-0-94045043-1",
+        new String[]{"978-0-94045043-1"},
+        new String[]{"<NUM>"});
+  }
+
+  public void testComplianceNumericFile() throws Exception {
+    assertAnalyzesTo(
+        a,
+        "78academyawards/rules/rule02.html",
+        new String[]{"78academyawards/rules/rule02.html"},
+        new String[]{"<NUM>"});
+  }
+
+  public void testComplianceNumericWithUnderscores() throws Exception {
+    assertAnalyzesTo(
+        a,
+        "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
+        new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
+        new String[]{"<NUM>"});
+  }
+
+  public void testComplianceNumericWithDash() throws Exception {
+    assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
+        new String[]{"<NUM>"});
+  }
+
+  public void testComplianceManyTokens() throws Exception {
+    assertAnalyzesTo(
+        a,
+        "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
+            + "safari-0-sheikh-zayed-grand-mosque.jpg",
+        new String[]{"money.cnn.com", "magazines", "fortune",
+            "fortune", "archive/2007/03/19/8402357", "index.htm",
+            "safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
+        new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
+            "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
+            "<ALPHANUM>", "<HOST>"});
+  }
+
+  public void testJava14BWCompatibility() throws Exception {
+    ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
+    assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
+  }
+
+  /**
+   * Make sure we skip wicked long terms.
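+   * A term longer than IndexWriter.MAX_TERM_LENGTH should be dropped at
+   * indexing time; the document itself is still indexed, and the skipped
+   * term still counts toward the positions of the tokens that follow it.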
+   */
+  public void testWickedLongTerm() throws IOException {
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));
+
+    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
+    Arrays.fill(chars, 'x');
+    Document doc = new Document();
+    final String bigTerm = new String(chars);
+
+    // This produces a too-long term:
+    String contents = "abc xyz x" + bigTerm + " another term";
+    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+
+    // Make sure we can add another normal document
+    doc = new Document();
+    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexReader reader = IndexReader.open(dir, true);
+
+    // Make sure all terms < max size were indexed
+    assertEquals(2, reader.docFreq(new Term("content", "abc")));
+    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+    assertEquals(1, reader.docFreq(new Term("content", "term")));
+    assertEquals(1, reader.docFreq(new Term("content", "another")));
+
+    // Make sure position is still incremented when
+    // massive term is skipped:
+    TermPositions tps = reader.termPositions(new Term("content", "another"));
+    assertTrue(tps.next());
+    assertEquals(1, tps.freq());
+    assertEquals(3, tps.nextPosition());
+
+    // Make sure the doc that has the massive term is in
+    // the index:
+    assertEquals("document with wicked long term is not in the index!", 2, reader.numDocs());
+
+    reader.close();
+
+    // Make sure we can add a document with exactly the
+    // maximum length term, and search on that term:
+    doc = new Document();
+    doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
+    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
+    sa.setMaxTokenLength(100000);
+    writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
+    writer.addDocument(doc);
+    writer.close();
+    reader = IndexReader.open(dir, true);
+    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+    reader.close();
+
+    dir.close();
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+  }
+}