1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.List;
24 import org.apache.lucene.document.Document;
25 import org.apache.lucene.document.Field;
26 import org.apache.lucene.document.NumericField;
27 import org.apache.lucene.index.IndexReader;
28 import org.apache.lucene.index.RandomIndexWriter;
29 import org.apache.lucene.index.Term;
30 import org.apache.lucene.search.BooleanClause.Occur;
31 import org.apache.lucene.search.grouping.GroupDocs;
32 import org.apache.lucene.search.grouping.TopGroups;
33 import org.apache.lucene.search.join.BlockJoinCollector;
34 import org.apache.lucene.search.join.BlockJoinQuery;
35 import org.apache.lucene.store.Directory;
36 import org.apache.lucene.util.BytesRef;
37 import org.apache.lucene.util.LuceneTestCase;
38 import org.apache.lucene.util._TestUtil;
40 public class TestBlockJoin extends LuceneTestCase {
43 private Document makeResume(String name, String country) {
44 Document resume = new Document();
45 resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED));
46 resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
47 resume.add(newField("country", country, Field.Index.NOT_ANALYZED));
51 // ... has multiple jobs
52 private Document makeJob(String skill, int year) {
53 Document job = new Document();
54 job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED));
55 job.add(new NumericField("year").setIntValue(year));
59 // ... has multiple qualifications
60 private Document makeQualification(String qualification, int year) {
61 Document job = new Document();
62 job.add(newField("qualification", qualification, Field.Store.YES, Field.Index.NOT_ANALYZED));
63 job.add(new NumericField("year").setIntValue(year));
67 public void testSimple() throws Exception {
69 final Directory dir = newDirectory();
70 final RandomIndexWriter w = new RandomIndexWriter(random, dir);
72 final List<Document> docs = new ArrayList<Document>();
74 docs.add(makeJob("java", 2007));
75 docs.add(makeJob("python", 2010));
76 docs.add(makeResume("Lisa", "United Kingdom"));
80 docs.add(makeJob("ruby", 2005));
81 docs.add(makeJob("java", 2006));
82 docs.add(makeResume("Frank", "United States"));
85 IndexReader r = w.getReader();
87 IndexSearcher s = new IndexSearcher(r);
89 // Create a filter that defines "parent" documents in the index - in this case resumes
90 Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
92 // Define child document criteria (finds an example of relevant work experience)
93 BooleanQuery childQuery = new BooleanQuery();
94 childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
95 childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
97 // Define parent document criteria (find a resident in the UK)
98 Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));
100 // Wrap the child document query to 'join' any matches
101 // up to corresponding parent:
102 BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
104 // Combine the parent and nested child queries into a single query for a candidate
105 BooleanQuery fullQuery = new BooleanQuery();
106 fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
107 fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST));
109 BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false);
111 s.search(fullQuery, c);
113 TopGroups<Integer> results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
115 //assertEquals(1, results.totalHitCount);
116 assertEquals(1, results.totalGroupedHitCount);
117 assertEquals(1, results.groups.length);
119 final GroupDocs<Integer> group = results.groups[0];
120 assertEquals(1, group.totalHits);
122 Document childDoc = s.doc(group.scoreDocs[0].doc);
123 //System.out.println(" doc=" + group.scoreDocs[0].doc);
124 assertEquals("java", childDoc.get("skill"));
125 assertNotNull(group.groupValue);
126 Document parentDoc = s.doc(group.groupValue);
127 assertEquals("Lisa", parentDoc.get("name"));
133 public void testBoostBug() throws Exception {
134 final Directory dir = newDirectory();
135 final RandomIndexWriter w = new RandomIndexWriter(random, dir);
136 IndexReader r = w.getReader();
138 IndexSearcher s = newSearcher(r);
140 BlockJoinQuery q = new BlockJoinQuery(new MatchAllDocsQuery(), new QueryWrapperFilter(new MatchAllDocsQuery()), BlockJoinQuery.ScoreMode.Avg);
142 BooleanQuery bq = new BooleanQuery();
143 bq.setBoost(2f); // we boost the BQ
144 bq.add(q, BooleanClause.Occur.MUST);
151 private String[][] getRandomFields(int maxUniqueValues) {
153 final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][];
154 for(int fieldID=0;fieldID<fields.length;fieldID++) {
155 final int valueCount;
159 valueCount = _TestUtil.nextInt(random, 1, maxUniqueValues);
162 final String[] values = fields[fieldID] = new String[valueCount];
163 for(int i=0;i<valueCount;i++) {
164 values[i] = _TestUtil.randomRealisticUnicodeString(random);
165 //values[i] = _TestUtil.randomSimpleString(random);
172 private Term randomParentTerm(String[] values) {
173 return new Term("parent0", values[random.nextInt(values.length)]);
176 private Term randomChildTerm(String[] values) {
177 return new Term("child0", values[random.nextInt(values.length)]);
180 private Sort getRandomSort(String prefix, int numFields) {
181 final List<SortField> sortFields = new ArrayList<SortField>();
182 // TODO: sometimes sort by score; problem is scores are
183 // not comparable across the two indices
184 // sortFields.add(SortField.FIELD_SCORE);
185 if (random.nextBoolean()) {
186 sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
187 } else if (random.nextBoolean()) {
188 sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
189 sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
192 sortFields.add(new SortField(prefix + "ID", SortField.INT));
193 return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
196 public void testRandom() throws Exception {
197 // We build two indices at once: one normalized (which
198 // BlockJoinQuery/Collector can query) and the other w/
199 // same docs just fully denormalized:
200 final Directory dir = newDirectory();
201 final Directory joinDir = newDirectory();
203 final int numParentDocs = _TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER);
204 //final int numParentDocs = 30;
206 // Values for parent fields:
207 final String[][] parentFields = getRandomFields(numParentDocs/2);
208 // Values for child fields:
209 final String[][] childFields = getRandomFields(numParentDocs);
211 // TODO: test star join, nested join cases too!
212 final RandomIndexWriter w = new RandomIndexWriter(random, dir);
213 final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir);
214 for(int parentDocID=0;parentDocID<numParentDocs;parentDocID++) {
215 Document parentDoc = new Document();
216 Document parentJoinDoc = new Document();
217 Field id = newField("parentID", ""+parentDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
219 parentJoinDoc.add(id);
220 parentJoinDoc.add(newField("isParent", "x", Field.Index.NOT_ANALYZED));
221 for(int field=0;field<parentFields.length;field++) {
222 if (random.nextDouble() < 0.9) {
223 Field f = newField("parent" + field,
224 parentFields[field][random.nextInt(parentFields[field].length)],
225 Field.Index.NOT_ANALYZED);
227 parentJoinDoc.add(f);
231 final List<Document> joinDocs = new ArrayList<Document>();
234 System.out.println(" " + parentDoc);
237 final int numChildDocs = _TestUtil.nextInt(random, 1, 20);
238 for(int childDocID=0;childDocID<numChildDocs;childDocID++) {
239 // Denormalize: copy all parent fields into child doc:
240 Document childDoc = _TestUtil.cloneDocument(parentDoc);
241 Document joinChildDoc = new Document();
242 joinDocs.add(joinChildDoc);
244 Field childID = newField("childID", ""+childDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
245 childDoc.add(childID);
246 joinChildDoc.add(childID);
248 for(int childFieldID=0;childFieldID<childFields.length;childFieldID++) {
249 if (random.nextDouble() < 0.9) {
250 Field f = newField("child" + childFieldID,
251 childFields[childFieldID][random.nextInt(childFields[childFieldID].length)],
252 Field.Index.NOT_ANALYZED);
259 System.out.println(" " + joinChildDoc);
262 w.addDocument(childDoc);
266 joinDocs.add(parentJoinDoc);
267 joinW.addDocuments(joinDocs);
270 final IndexReader r = w.getReader();
272 final IndexReader joinR = joinW.getReader();
276 System.out.println("TEST: reader=" + r);
277 System.out.println("TEST: joinReader=" + joinR);
279 for(int docIDX=0;docIDX<joinR.maxDoc();docIDX++) {
280 System.out.println(" docID=" + docIDX + " doc=" + joinR.document(docIDX));
284 final IndexSearcher s = new IndexSearcher(r);
285 s.setDefaultFieldSortScoring(true, true);
287 final IndexSearcher joinS = new IndexSearcher(joinR);
289 final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));
291 final int iters = 200*RANDOM_MULTIPLIER;
293 for(int iter=0;iter<iters;iter++) {
295 System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
298 final Query childQuery;
299 if (random.nextInt(3) == 2) {
300 final int childFieldID = random.nextInt(childFields.length);
301 childQuery = new TermQuery(new Term("child" + childFieldID,
302 childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
303 } else if (random.nextInt(3) == 2) {
304 BooleanQuery bq = new BooleanQuery();
306 final int numClauses = _TestUtil.nextInt(random, 2, 4);
307 boolean didMust = false;
308 for(int clauseIDX=0;clauseIDX<numClauses;clauseIDX++) {
310 BooleanClause.Occur occur;
311 if (!didMust && random.nextBoolean()) {
312 occur = random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT;
313 clause = new TermQuery(randomChildTerm(childFields[0]));
316 occur = BooleanClause.Occur.SHOULD;
317 final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
318 clause = new TermQuery(new Term("child" + childFieldID,
319 childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
321 bq.add(clause, occur);
324 BooleanQuery bq = new BooleanQuery();
327 bq.add(new TermQuery(randomChildTerm(childFields[0])),
328 BooleanClause.Occur.MUST);
329 final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
330 bq.add(new TermQuery(new Term("child" + childFieldID, childFields[childFieldID][random.nextInt(childFields[childFieldID].length)])),
331 random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
334 final BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
336 // To run against the block-join index:
337 final Query parentJoinQuery;
339 // Same query as parentJoinQuery, but to run against
340 // the fully denormalized index (so we can compare)
342 final Query parentQuery;
344 if (random.nextBoolean()) {
345 parentQuery = childQuery;
346 parentJoinQuery = childJoinQuery;
348 // AND parent field w/ child field
349 final BooleanQuery bq = new BooleanQuery();
350 parentJoinQuery = bq;
351 final Term parentTerm = randomParentTerm(parentFields[0]);
352 if (random.nextBoolean()) {
353 bq.add(childJoinQuery, BooleanClause.Occur.MUST);
354 bq.add(new TermQuery(parentTerm),
355 BooleanClause.Occur.MUST);
357 bq.add(new TermQuery(parentTerm),
358 BooleanClause.Occur.MUST);
359 bq.add(childJoinQuery, BooleanClause.Occur.MUST);
362 final BooleanQuery bq2 = new BooleanQuery();
364 if (random.nextBoolean()) {
365 bq2.add(childQuery, BooleanClause.Occur.MUST);
366 bq2.add(new TermQuery(parentTerm),
367 BooleanClause.Occur.MUST);
369 bq2.add(new TermQuery(parentTerm),
370 BooleanClause.Occur.MUST);
371 bq2.add(childQuery, BooleanClause.Occur.MUST);
375 final Sort parentSort = getRandomSort("parent", parentFields.length);
376 final Sort childSort = getRandomSort("child", childFields.length);
379 System.out.println("\nTEST: query=" + parentQuery + " joinQuery=" + parentJoinQuery + " parentSort=" + parentSort + " childSort=" + childSort);
383 final List<SortField> sortFields = new ArrayList<SortField>(Arrays.asList(parentSort.getSort()));
384 sortFields.addAll(Arrays.asList(childSort.getSort()));
385 final Sort parentAndChildSort = new Sort(sortFields.toArray(new SortField[sortFields.size()]));
387 final TopDocs results = s.search(parentQuery, null, r.numDocs(),
391 System.out.println("\nTEST: normal index gets " + results.totalHits + " hits");
392 final ScoreDoc[] hits = results.scoreDocs;
393 for(int hitIDX=0;hitIDX<hits.length;hitIDX++) {
394 final Document doc = s.doc(hits[hitIDX].doc);
395 //System.out.println(" score=" + hits[hitIDX].score + " parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
396 System.out.println(" parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
397 FieldDoc fd = (FieldDoc) hits[hitIDX];
398 if (fd.fields != null) {
399 System.out.print(" ");
400 for(Object o : fd.fields) {
401 if (o instanceof BytesRef) {
402 System.out.print(((BytesRef) o).utf8ToString() + " ");
404 System.out.print(o + " ");
407 System.out.println();
412 final BlockJoinCollector c = new BlockJoinCollector(parentSort, 10, true, true);
414 joinS.search(parentJoinQuery, c);
416 final int hitsPerGroup = _TestUtil.nextInt(random, 1, 20);
417 //final int hitsPerGroup = 100;
418 final TopGroups<Integer> joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true);
421 System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup);
422 if (joinResults != null) {
423 final GroupDocs<Integer>[] groups = joinResults.groups;
424 for(int groupIDX=0;groupIDX<groups.length;groupIDX++) {
425 final GroupDocs<Integer> group = groups[groupIDX];
426 if (group.groupSortValues != null) {
427 System.out.print(" ");
428 for(Object o : group.groupSortValues) {
429 if (o instanceof BytesRef) {
430 System.out.print(((BytesRef) o).utf8ToString() + " ");
432 System.out.print(o + " ");
435 System.out.println();
438 assertNotNull(group.groupValue);
439 final Document parentDoc = joinS.doc(group.groupValue);
440 System.out.println(" group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")");
441 for(int hitIDX=0;hitIDX<group.scoreDocs.length;hitIDX++) {
442 final Document doc = joinS.doc(group.scoreDocs[hitIDX].doc);
443 //System.out.println(" score=" + group.scoreDocs[hitIDX].score + " childID=" + doc.get("childID") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
444 System.out.println(" childID=" + doc.get("childID") + " child0=" + doc.get("child0") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
450 if (results.totalHits == 0) {
451 assertNull(joinResults);
453 compareHits(r, joinR, results, joinResults);
463 private void compareHits(IndexReader r, IndexReader joinR, TopDocs results, TopGroups<Integer> joinResults) throws Exception {
464 // results is 'complete'; joinResults is a subset
466 int joinGroupUpto = 0;
468 final ScoreDoc[] hits = results.scoreDocs;
469 final GroupDocs<Integer>[] groupDocs = joinResults.groups;
471 while(joinGroupUpto < groupDocs.length) {
472 final GroupDocs<Integer> group = groupDocs[joinGroupUpto++];
473 final ScoreDoc[] groupHits = group.scoreDocs;
474 assertNotNull(group.groupValue);
475 final Document parentDoc = joinR.document(group.groupValue);
476 final String parentID = parentDoc.get("parentID");
477 //System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc);
478 assertNotNull(parentID);
479 assertTrue(groupHits.length > 0);
480 for(int hitIDX=0;hitIDX<groupHits.length;hitIDX++) {
481 final Document nonJoinHit = r.document(hits[resultUpto++].doc);
482 final Document joinHit = joinR.document(groupHits[hitIDX].doc);
483 assertEquals(parentID,
484 nonJoinHit.get("parentID"));
485 assertEquals(joinHit.get("childID"),
486 nonJoinHit.get("childID"));
489 if (joinGroupUpto < groupDocs.length) {
490 // Advance non-join hit to the next parentID:
491 //System.out.println(" next joingroupUpto=" + joinGroupUpto + " gd.length=" + groupDocs.length + " parentID=" + parentID);
493 assertTrue(resultUpto < hits.length);
494 if (!parentID.equals(r.document(hits[resultUpto].doc).get("parentID"))) {
503 public void testMultiChildTypes() throws Exception {
505 final Directory dir = newDirectory();
506 final RandomIndexWriter w = new RandomIndexWriter(random, dir);
508 final List<Document> docs = new ArrayList<Document>();
510 docs.add(makeJob("java", 2007));
511 docs.add(makeJob("python", 2010));
512 docs.add(makeQualification("maths", 1999));
513 docs.add(makeResume("Lisa", "United Kingdom"));
514 w.addDocuments(docs);
516 IndexReader r = w.getReader();
518 IndexSearcher s = new IndexSearcher(r);
520 // Create a filter that defines "parent" documents in the index - in this case resumes
521 Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
523 // Define child document criteria (finds an example of relevant work experience)
524 BooleanQuery childJobQuery = new BooleanQuery();
525 childJobQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
526 childJobQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
528 BooleanQuery childQualificationQuery = new BooleanQuery();
529 childQualificationQuery.add(new BooleanClause(new TermQuery(new Term("qualification", "maths")), Occur.MUST));
530 childQualificationQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 1980, 2000, true, true), Occur.MUST));
533 // Define parent document criteria (find a resident in the UK)
534 Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));
536 // Wrap the child document query to 'join' any matches
537 // up to corresponding parent:
538 BlockJoinQuery childJobJoinQuery = new BlockJoinQuery(childJobQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
539 BlockJoinQuery childQualificationJoinQuery = new BlockJoinQuery(childQualificationQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
541 // Combine the parent and nested child queries into a single query for a candidate
542 BooleanQuery fullQuery = new BooleanQuery();
543 fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
544 fullQuery.add(new BooleanClause(childJobJoinQuery, Occur.MUST));
545 fullQuery.add(new BooleanClause(childQualificationJoinQuery, Occur.MUST));
547 //????? How do I control volume of jobs vs qualifications per parent?
548 BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 10, true, false);
550 s.search(fullQuery, c);
552 //Examine "Job" children
553 boolean showNullPointerIssue=true;
554 if (showNullPointerIssue) {
555 TopGroups<Integer> jobResults = c.getTopGroups(childJobJoinQuery, null, 0, 10, 0, true);
557 //assertEquals(1, results.totalHitCount);
558 assertEquals(1, jobResults.totalGroupedHitCount);
559 assertEquals(1, jobResults.groups.length);
561 final GroupDocs<Integer> group = jobResults.groups[0];
562 assertEquals(1, group.totalHits);
564 Document childJobDoc = s.doc(group.scoreDocs[0].doc);
565 //System.out.println(" doc=" + group.scoreDocs[0].doc);
566 assertEquals("java", childJobDoc.get("skill"));
567 assertNotNull(group.groupValue);
568 Document parentDoc = s.doc(group.groupValue);
569 assertEquals("Lisa", parentDoc.get("name"));
572 //Now Examine qualification children
573 TopGroups<Integer> qualificationResults = c.getTopGroups(childQualificationJoinQuery, null, 0, 10, 0, true);
575 //!!!!! This next line can null pointer - but only if prior "jobs" section called first
576 assertEquals(1, qualificationResults.totalGroupedHitCount);
577 assertEquals(1, qualificationResults.groups.length);
579 final GroupDocs<Integer> qGroup = qualificationResults.groups[0];
580 assertEquals(1, qGroup.totalHits);
582 Document childQualificationDoc = s.doc(qGroup.scoreDocs[0].doc);
583 assertEquals("maths", childQualificationDoc.get("qualification"));
584 assertNotNull(qGroup.groupValue);
585 Document parentDoc = s.doc(qGroup.groupValue);
586 assertEquals("Lisa", parentDoc.get("name"));