X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/join/src/test/org/apache/lucene/search/TestBlockJoin.java diff --git a/lucene-java-3.5.0/lucene/contrib/join/src/test/org/apache/lucene/search/TestBlockJoin.java b/lucene-java-3.5.0/lucene/contrib/join/src/test/org/apache/lucene/search/TestBlockJoin.java new file mode 100644 index 0000000..27627a9 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/join/src/test/org/apache/lucene/search/TestBlockJoin.java @@ -0,0 +1,592 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.grouping.GroupDocs; +import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.search.join.BlockJoinCollector; +import org.apache.lucene.search.join.BlockJoinQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class TestBlockJoin extends LuceneTestCase { + + // One resume... + private Document makeResume(String name, String country) { + Document resume = new Document(); + resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED)); + resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED)); + resume.add(newField("country", country, Field.Index.NOT_ANALYZED)); + return resume; + } + + // ... has multiple jobs + private Document makeJob(String skill, int year) { + Document job = new Document(); + job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED)); + job.add(new NumericField("year").setIntValue(year)); + return job; + } + + // ... has multiple qualifications + private Document makeQualification(String qualification, int year) { + Document job = new Document(); + job.add(newField("qualification", qualification, Field.Store.YES, Field.Index.NOT_ANALYZED)); + job.add(new NumericField("year").setIntValue(year)); + return job; + } + + public void testSimple() throws Exception { + + final Directory dir = newDirectory(); + final RandomIndexWriter w = new RandomIndexWriter(random, dir); + + final List docs = new ArrayList(); + + docs.add(makeJob("java", 2007)); + docs.add(makeJob("python", 2010)); + docs.add(makeResume("Lisa", "United Kingdom")); + w.addDocuments(docs); + + docs.clear(); + docs.add(makeJob("ruby", 2005)); + docs.add(makeJob("java", 2006)); + docs.add(makeResume("Frank", "United States")); + w.addDocuments(docs); + + IndexReader r = w.getReader(); + w.close(); + IndexSearcher s = new IndexSearcher(r); + + // Create a filter that defines "parent" documents in the index - in this case resumes + Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume")))); + + // Define child document criteria (finds an example of relevant work experience) + BooleanQuery childQuery = new BooleanQuery(); + childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST)); + childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST)); + + // Define parent document criteria (find a resident in the UK) + Query parentQuery = new TermQuery(new Term("country", "United Kingdom")); + + // Wrap the child document query to 'join' any matches + // up to corresponding parent: + BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); + + // Combine the parent and nested child queries into a single query for a candidate + BooleanQuery fullQuery = new BooleanQuery(); + fullQuery.add(new BooleanClause(parentQuery, Occur.MUST)); + fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST)); + + BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false); + + s.search(fullQuery, c); + + TopGroups results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true); + + //assertEquals(1, results.totalHitCount); + assertEquals(1, results.totalGroupedHitCount); + assertEquals(1, results.groups.length); + + final GroupDocs group = results.groups[0]; + assertEquals(1, group.totalHits); + + Document childDoc = s.doc(group.scoreDocs[0].doc); + //System.out.println(" doc=" + group.scoreDocs[0].doc); + assertEquals("java", childDoc.get("skill")); + assertNotNull(group.groupValue); + Document parentDoc = s.doc(group.groupValue); + assertEquals("Lisa", parentDoc.get("name")); + + r.close(); + dir.close(); + } + + public void testBoostBug() throws Exception { + final Directory dir = newDirectory(); + final RandomIndexWriter w = new RandomIndexWriter(random, dir); + IndexReader r = w.getReader(); + w.close(); + IndexSearcher s = newSearcher(r); + + BlockJoinQuery q = new BlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), BlockJoinQuery.ScoreMode.Avg); + s.search(q, 10); + BooleanQuery bq = new BooleanQuery(); + bq.setBoost(2f); // we boost the BQ + bq.add(q, BooleanClause.Occur.MUST); + s.search(bq, 10); + s.close(); + r.close(); + dir.close(); + } + + private String[][] getRandomFields(int maxUniqueValues) { + + final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][]; + for(int fieldID=0;fieldID sortFields = new ArrayList(); + // TODO: sometimes sort by score; problem is scores are + // not comparable across the two indices + // sortFields.add(SortField.FIELD_SCORE); + if (random.nextBoolean()) { + sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean())); + } else if (random.nextBoolean()) { + sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean())); + sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean())); + } + // Break ties: + sortFields.add(new SortField(prefix + "ID", SortField.INT)); + return new Sort(sortFields.toArray(new SortField[sortFields.size()])); + } + + public void testRandom() throws Exception { + // We build two indices at once: one normalized (which + // BlockJoinQuery/Collector can query) and the other w/ + // same docs just fully denormalized: + final Directory dir = newDirectory(); + final Directory joinDir = newDirectory(); + + final int numParentDocs = _TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER); + //final int numParentDocs = 30; + + // Values for parent fields: + final String[][] parentFields = getRandomFields(numParentDocs/2); + // Values for child fields: + final String[][] childFields = getRandomFields(numParentDocs); + + // TODO: test star join, nested join cases too! + final RandomIndexWriter w = new RandomIndexWriter(random, dir); + final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir); + for(int parentDocID=0;parentDocID joinDocs = new ArrayList(); + + if (VERBOSE) { + System.out.println(" " + parentDoc); + } + + final int numChildDocs = _TestUtil.nextInt(random, 1, 20); + for(int childDocID=0;childDocID joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true); + + if (VERBOSE) { + System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup); + if (joinResults != null) { + final GroupDocs[] groups = joinResults.groups; + for(int groupIDX=0;groupIDX group = groups[groupIDX]; + if (group.groupSortValues != null) { + System.out.print(" "); + for(Object o : group.groupSortValues) { + if (o instanceof BytesRef) { + System.out.print(((BytesRef) o).utf8ToString() + " "); + } else { + System.out.print(o + " "); + } + } + System.out.println(); + } + + assertNotNull(group.groupValue); + final Document parentDoc = joinS.doc(group.groupValue); + System.out.println(" group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")"); + for(int hitIDX=0;hitIDX joinResults) throws Exception { + // results is 'complete'; joinResults is a subset + int resultUpto = 0; + int joinGroupUpto = 0; + + final ScoreDoc[] hits = results.scoreDocs; + final GroupDocs[] groupDocs = joinResults.groups; + + while(joinGroupUpto < groupDocs.length) { + final GroupDocs group = groupDocs[joinGroupUpto++]; + final ScoreDoc[] groupHits = group.scoreDocs; + assertNotNull(group.groupValue); + final Document parentDoc = joinR.document(group.groupValue); + final String parentID = parentDoc.get("parentID"); + //System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc); + assertNotNull(parentID); + assertTrue(groupHits.length > 0); + for(int hitIDX=0;hitIDX docs = new ArrayList(); + + docs.add(makeJob("java", 2007)); + docs.add(makeJob("python", 2010)); + docs.add(makeQualification("maths", 1999)); + docs.add(makeResume("Lisa", "United Kingdom")); + w.addDocuments(docs); + + IndexReader r = w.getReader(); + w.close(); + IndexSearcher s = new IndexSearcher(r); + + // Create a filter that defines "parent" documents in the index - in this case resumes + Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume")))); + + // Define child document criteria (finds an example of relevant work experience) + BooleanQuery childJobQuery = new BooleanQuery(); + childJobQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST)); + childJobQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST)); + + BooleanQuery childQualificationQuery = new BooleanQuery(); + childQualificationQuery.add(new BooleanClause(new TermQuery(new Term("qualification", "maths")), Occur.MUST)); + childQualificationQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 1980, 2000, true, true), Occur.MUST)); + + + // Define parent document criteria (find a resident in the UK) + Query parentQuery = new TermQuery(new Term("country", "United Kingdom")); + + // Wrap the child document query to 'join' any matches + // up to corresponding parent: + BlockJoinQuery childJobJoinQuery = new BlockJoinQuery(childJobQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); + BlockJoinQuery childQualificationJoinQuery = new BlockJoinQuery(childQualificationQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg); + + // Combine the parent and nested child queries into a single query for a candidate + BooleanQuery fullQuery = new BooleanQuery(); + fullQuery.add(new BooleanClause(parentQuery, Occur.MUST)); + fullQuery.add(new BooleanClause(childJobJoinQuery, Occur.MUST)); + fullQuery.add(new BooleanClause(childQualificationJoinQuery, Occur.MUST)); + + //????? How do I control volume of jobs vs qualifications per parent? + BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false); + + s.search(fullQuery, c); + + //Examine "Job" children + boolean showNullPointerIssue=true; + if (showNullPointerIssue) { + TopGroups jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true); + + //assertEquals(1, results.totalHitCount); + assertEquals(1, jobResults.totalGroupedHitCount); + assertEquals(1, jobResults.groups.length); + + final GroupDocs group = jobResults.groups[0]; + assertEquals(1, group.totalHits); + + Document childJobDoc = s.doc(group.scoreDocs[0].doc); + //System.out.println(" doc=" + group.scoreDocs[0].doc); + assertEquals("java", childJobDoc.get("skill")); + assertNotNull(group.groupValue); + Document parentDoc = s.doc(group.groupValue); + assertEquals("Lisa", parentDoc.get("name")); + } + + //Now Examine qualification children + TopGroups qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true); + + //!!!!! This next line can null pointer - but only if prior "jobs" section called first + assertEquals(1, qualificationResults.totalGroupedHitCount); + assertEquals(1, qualificationResults.groups.length); + + final GroupDocs qGroup = qualificationResults.groups[0]; + assertEquals(1, qGroup.totalHits); + + Document childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc); + assertEquals("maths", childQualificationDoc.get("qualification")); + assertNotNull(qGroup.groupValue); + Document parentDoc = s.doc(qGroup.groupValue); + assertEquals("Lisa", parentDoc.get("name")); + + + r.close(); + dir.close(); + } +}