X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java diff --git a/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java deleted file mode 100644 index 0d07f20..0000000 --- a/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java +++ /dev/null @@ -1,697 +0,0 @@ -package org.apache.lucene.search; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.*; -import org.apache.lucene.analysis.tokenattributes.*; -import org.apache.lucene.document.*; -import org.apache.lucene.index.*; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.store.*; -import org.apache.lucene.util.Version; -import org.apache.lucene.util._TestUtil; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.List; -import java.util.ArrayList; -import java.util.Random; - -/** - * Tests {@link PhraseQuery}. - * - * @see TestPositionIncrement - */ -public class TestPhraseQuery extends LuceneTestCase { - - /** threshold for comparing floats */ - public static final float SCORE_COMP_THRESH = 1e-6f; - - private static IndexSearcher searcher; - private static IndexReader reader; - private PhraseQuery query; - private static Directory directory; - - @BeforeClass - public static void beforeClass() throws Exception { - directory = newDirectory(); - Analyzer analyzer = new Analyzer() { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); - } - - @Override - public int getPositionIncrementGap(String fieldName) { - return 100; - } - }; - RandomIndexWriter writer = new RandomIndexWriter(random, directory, analyzer); - - Document doc = new Document(); - doc.add(newField("field", "one two three four five", Field.Store.YES, Field.Index.ANALYZED)); - doc.add(newField("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.ANALYZED)); - Fieldable repeatedField = newField("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.ANALYZED); - doc.add(repeatedField); - doc.add(newField("palindrome", "one two three two one", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("nonexist", "phrase exist notexist exist found", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - reader = writer.getReader(); - writer.close(); - - searcher = newSearcher(reader); - } - - @Override - public void setUp() throws Exception { - super.setUp(); - query = new PhraseQuery(); - } - - @AfterClass - public static void afterClass() throws Exception { - searcher.close(); - searcher = null; - reader.close(); - reader = null; - directory.close(); - directory = null; - } - - public void testNotCloseEnough() throws Exception { - query.setSlop(2); - query.add(new Term("field", "one")); - query.add(new Term("field", "five")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - QueryUtils.check(random, query,searcher); - } - - public void testBarelyCloseEnough() throws Exception { - query.setSlop(3); - query.add(new Term("field", "one")); - query.add(new Term("field", "five")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - QueryUtils.check(random, query,searcher); - } - - /** - * Ensures slop of 0 works for exact matches, but not reversed - */ - public void testExact() throws Exception { - // slop is zero by default - query.add(new Term("field", "four")); - query.add(new Term("field", "five")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("exact match", 1, hits.length); - QueryUtils.check(random, query,searcher); - - - query = new PhraseQuery(); - query.add(new Term("field", "two")); - query.add(new Term("field", "one")); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("reverse not exact", 0, hits.length); - QueryUtils.check(random, query,searcher); - } - - public void testSlop1() throws Exception { - // Ensures slop of 1 works with terms in order. - query.setSlop(1); - query.add(new Term("field", "one")); - query.add(new Term("field", "two")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("in order", 1, hits.length); - QueryUtils.check(random, query,searcher); - - - // Ensures slop of 1 does not work for phrases out of order; - // must be at least 2. - query = new PhraseQuery(); - query.setSlop(1); - query.add(new Term("field", "two")); - query.add(new Term("field", "one")); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("reversed, slop not 2 or more", 0, hits.length); - QueryUtils.check(random, query,searcher); - } - - /** - * As long as slop is at least 2, terms can be reversed - */ - public void testOrderDoesntMatter() throws Exception { - query.setSlop(2); // must be at least two for reverse order match - query.add(new Term("field", "two")); - query.add(new Term("field", "one")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("just sloppy enough", 1, hits.length); - QueryUtils.check(random, query,searcher); - - - query = new PhraseQuery(); - query.setSlop(2); - query.add(new Term("field", "three")); - query.add(new Term("field", "one")); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("not sloppy enough", 0, hits.length); - QueryUtils.check(random, query,searcher); - - } - - /** - * slop is the total number of positional moves allowed - * to line up a phrase - */ - public void testMulipleTerms() throws Exception { - query.setSlop(2); - query.add(new Term("field", "one")); - query.add(new Term("field", "three")); - query.add(new Term("field", "five")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("two total moves", 1, hits.length); - QueryUtils.check(random, query,searcher); - - - query = new PhraseQuery(); - query.setSlop(5); // it takes six moves to match this phrase - query.add(new Term("field", "five")); - query.add(new Term("field", "three")); - query.add(new Term("field", "one")); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("slop of 5 not close enough", 0, hits.length); - QueryUtils.check(random, query,searcher); - - - query.setSlop(6); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("slop of 6 just right", 1, hits.length); - QueryUtils.check(random, query,searcher); - - } - - public void testPhraseQueryWithStopAnalyzer() throws Exception { - Directory directory = newDirectory(); - StopAnalyzer stopAnalyzer = new StopAnalyzer(Version.LUCENE_24); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - newIndexWriterConfig( Version.LUCENE_24, stopAnalyzer)); - Document doc = new Document(); - doc.add(newField("field", "the stop words are here", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - IndexReader reader = writer.getReader(); - writer.close(); - - IndexSearcher searcher = newSearcher(reader); - - // valid exact phrase query - PhraseQuery query = new PhraseQuery(); - query.add(new Term("field","stop")); - query.add(new Term("field","words")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - QueryUtils.check(random, query,searcher); - - - // StopAnalyzer as of 2.4 does not leave "holes", so this matches. - query = new PhraseQuery(); - query.add(new Term("field", "words")); - query.add(new Term("field", "here")); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - QueryUtils.check(random, query,searcher); - - - searcher.close(); - reader.close(); - directory.close(); - } - - public void testPhraseQueryInConjunctionScorer() throws Exception { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory); - - Document doc = new Document(); - doc.add(newField("source", "marketing info", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("contents", "foobar", Field.Store.YES, Field.Index.ANALYZED)); - doc.add(newField("source", "marketing info", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - IndexReader reader = writer.getReader(); - writer.close(); - - IndexSearcher searcher = newSearcher(reader); - - PhraseQuery phraseQuery = new PhraseQuery(); - phraseQuery.add(new Term("source", "marketing")); - phraseQuery.add(new Term("source", "info")); - ScoreDoc[] hits = searcher.search(phraseQuery, null, 1000).scoreDocs; - assertEquals(2, hits.length); - QueryUtils.check(random, phraseQuery,searcher); - - - TermQuery termQuery = new TermQuery(new Term("contents","foobar")); - BooleanQuery booleanQuery = new BooleanQuery(); - booleanQuery.add(termQuery, BooleanClause.Occur.MUST); - booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST); - hits = searcher.search(booleanQuery, null, 1000).scoreDocs; - assertEquals(1, hits.length); - QueryUtils.check(random, termQuery,searcher); - - - searcher.close(); - reader.close(); - - writer = new RandomIndexWriter(random, directory, - newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.CREATE)); - doc = new Document(); - doc.add(newField("contents", "map entry woo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("contents", "woo map entry", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - doc = new Document(); - doc.add(newField("contents", "map foobarword entry woo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - reader = writer.getReader(); - writer.close(); - - searcher = newSearcher(reader); - - termQuery = new TermQuery(new Term("contents","woo")); - phraseQuery = new PhraseQuery(); - phraseQuery.add(new Term("contents","map")); - phraseQuery.add(new Term("contents","entry")); - - hits = searcher.search(termQuery, null, 1000).scoreDocs; - assertEquals(3, hits.length); - hits = searcher.search(phraseQuery, null, 1000).scoreDocs; - assertEquals(2, hits.length); - - - booleanQuery = new BooleanQuery(); - booleanQuery.add(termQuery, BooleanClause.Occur.MUST); - booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST); - hits = searcher.search(booleanQuery, null, 1000).scoreDocs; - assertEquals(2, hits.length); - - booleanQuery = new BooleanQuery(); - booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST); - booleanQuery.add(termQuery, BooleanClause.Occur.MUST); - hits = searcher.search(booleanQuery, null, 1000).scoreDocs; - assertEquals(2, hits.length); - QueryUtils.check(random, booleanQuery,searcher); - - - searcher.close(); - reader.close(); - directory.close(); - } - - public void testSlopScoring() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); - - Document doc = new Document(); - doc.add(newField("field", "foo firstname lastname foo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc); - - Document doc2 = new Document(); - doc2.add(newField("field", "foo firstname zzz lastname foo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc2); - - Document doc3 = new Document(); - doc3.add(newField("field", "foo firstname zzz yyy lastname foo", Field.Store.YES, Field.Index.ANALYZED)); - writer.addDocument(doc3); - - IndexReader reader = writer.getReader(); - writer.close(); - - IndexSearcher searcher = newSearcher(reader); - PhraseQuery query = new PhraseQuery(); - query.add(new Term("field", "firstname")); - query.add(new Term("field", "lastname")); - query.setSlop(Integer.MAX_VALUE); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(3, hits.length); - // Make sure that those matches where the terms appear closer to - // each other get a higher score: - assertEquals(0.71, hits[0].score, 0.01); - assertEquals(0, hits[0].doc); - assertEquals(0.44, hits[1].score, 0.01); - assertEquals(1, hits[1].doc); - assertEquals(0.31, hits[2].score, 0.01); - assertEquals(2, hits[2].doc); - QueryUtils.check(random, query,searcher); - searcher.close(); - reader.close(); - directory.close(); - } - - public void testToString() throws Exception { - StopAnalyzer analyzer = new StopAnalyzer(TEST_VERSION_CURRENT); - QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", analyzer); - qp.setEnablePositionIncrements(true); - PhraseQuery q = (PhraseQuery)qp.parse("\"this hi this is a test is\""); - assertEquals("field:\"? hi ? ? ? test\"", q.toString()); - q.add(new Term("field", "hello"), 1); - assertEquals("field:\"? hi|hello ? ? ? test\"", q.toString()); - } - - public void testWrappedPhrase() throws IOException { - query.add(new Term("repeated", "first")); - query.add(new Term("repeated", "part")); - query.add(new Term("repeated", "second")); - query.add(new Term("repeated", "part")); - query.setSlop(100); - - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("slop of 100 just right", 1, hits.length); - QueryUtils.check(random, query,searcher); - - query.setSlop(99); - - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("slop of 99 not enough", 0, hits.length); - QueryUtils.check(random, query,searcher); - } - - // work on two docs like this: "phrase exist notexist exist found" - public void testNonExistingPhrase() throws IOException { - // phrase without repetitions that exists in 2 docs - query.add(new Term("nonexist", "phrase")); - query.add(new Term("nonexist", "notexist")); - query.add(new Term("nonexist", "found")); - query.setSlop(2); // would be found this way - - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("phrase without repetitions exists in 2 docs", 2, hits.length); - QueryUtils.check(random, query,searcher); - - // phrase with repetitions that exists in 2 docs - query = new PhraseQuery(); - query.add(new Term("nonexist", "phrase")); - query.add(new Term("nonexist", "exist")); - query.add(new Term("nonexist", "exist")); - query.setSlop(1); // would be found - - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("phrase with repetitions exists in two docs", 2, hits.length); - QueryUtils.check(random, query,searcher); - - // phrase I with repetitions that does not exist in any doc - query = new PhraseQuery(); - query.add(new Term("nonexist", "phrase")); - query.add(new Term("nonexist", "notexist")); - query.add(new Term("nonexist", "phrase")); - query.setSlop(1000); // would not be found no matter how high the slop is - - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length); - QueryUtils.check(random, query,searcher); - - // phrase II with repetitions that does not exist in any doc - query = new PhraseQuery(); - query.add(new Term("nonexist", "phrase")); - query.add(new Term("nonexist", "exist")); - query.add(new Term("nonexist", "exist")); - query.add(new Term("nonexist", "exist")); - query.setSlop(1000); // would not be found no matter how high the slop is - - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length); - QueryUtils.check(random, query,searcher); - - } - - /** - * Working on a 2 fields like this: - * Field("field", "one two three four five") - * Field("palindrome", "one two three two one") - * Phrase of size 2 occuriong twice, once in order and once in reverse, - * because doc is a palyndrome, is counted twice. - * Also, in this case order in query does not matter. - * Also, when an exact match is found, both sloppy scorer and exact scorer scores the same. - */ - public void testPalyndrome2() throws Exception { - - // search on non palyndrome, find phrase with no slop, using exact phrase scorer - query.setSlop(0); // to use exact phrase scorer - query.add(new Term("field", "two")); - query.add(new Term("field", "three")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("phrase found with exact phrase scorer", 1, hits.length); - float score0 = hits[0].score; - //System.out.println("(exact) field: two three: "+score0); - QueryUtils.check(random, query,searcher); - - // search on non palyndrome, find phrase with slop 2, though no slop required here. - query.setSlop(2); // to use sloppy scorer - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("just sloppy enough", 1, hits.length); - float score1 = hits[0].score; - //System.out.println("(sloppy) field: two three: "+score1); - assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH); - QueryUtils.check(random, query,searcher); - - // search ordered in palyndrome, find it twice - query = new PhraseQuery(); - query.setSlop(2); // must be at least two for both ordered and reversed to match - query.add(new Term("palindrome", "two")); - query.add(new Term("palindrome", "three")); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals("just sloppy enough", 1, hits.length); - //float score2 = hits[0].score; - //System.out.println("palindrome: two three: "+score2); - QueryUtils.check(random, query,searcher); - - //commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq(). - //assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH> docs = new ArrayList>(); - Document d = new Document(); - Field f = newField("f", "", Field.Store.NO, Field.Index.ANALYZED); - d.add(f); - - Random r = random; - - int NUM_DOCS = atLeast(10); - for (int i = 0; i < NUM_DOCS; i++) { - // must be > 4096 so it spans multiple chunks - int termCount = _TestUtil.nextInt(random, 4097, 8200); - - List doc = new ArrayList(); - - StringBuilder sb = new StringBuilder(); - while(doc.size() < termCount) { - if (r.nextInt(5) == 1 || docs.size() == 0) { - // make new non-empty-string term - String term; - while(true) { - term = _TestUtil.randomUnicodeString(r); - if (term.length() > 0) { - break; - } - } - TokenStream ts = analyzer.reusableTokenStream("ignore", new StringReader(term)); - CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - while(ts.incrementToken()) { - String text = termAttr.toString(); - doc.add(text); - sb.append(text).append(' '); - } - ts.end(); - ts.close(); - } else { - // pick existing sub-phrase - List lastDoc = docs.get(r.nextInt(docs.size())); - int len = _TestUtil.nextInt(r, 1, 10); - int start = r.nextInt(lastDoc.size()-len); - for(int k=start;k doc = docs.get(docID); - - final int numTerm = _TestUtil.nextInt(r, 2, 20); - final int start = r.nextInt(doc.size()-numTerm); - PhraseQuery pq = new PhraseQuery(); - StringBuilder sb = new StringBuilder(); - for(int t=start;t