1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.List;
24 import org.apache.lucene.document.Document;
25 import org.apache.lucene.document.Field;
26 import org.apache.lucene.document.NumericField;
27 import org.apache.lucene.index.IndexReader;
28 import org.apache.lucene.index.RandomIndexWriter;
29 import org.apache.lucene.index.Term;
30 import org.apache.lucene.search.BooleanClause.Occur;
31 import org.apache.lucene.search.grouping.GroupDocs;
32 import org.apache.lucene.search.grouping.TopGroups;
33 import org.apache.lucene.search.join.BlockJoinCollector;
34 import org.apache.lucene.search.join.BlockJoinQuery;
35 import org.apache.lucene.store.Directory;
36 import org.apache.lucene.util.BytesRef;
37 import org.apache.lucene.util.LuceneTestCase;
38 import org.apache.lucene.util._TestUtil;
40 public class TestBlockJoin extends LuceneTestCase {
43 private Document makeResume(String name, String country) {
44 Document resume = new Document();
45 resume.add(newField("docType", "resume", Field.Index.NOT_ANALYZED));
46 resume.add(newField("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
47 resume.add(newField("country", country, Field.Index.NOT_ANALYZED));
51 // ... has multiple jobs
52 private Document makeJob(String skill, int year) {
53 Document job = new Document();
54 job.add(newField("skill", skill, Field.Store.YES, Field.Index.NOT_ANALYZED));
55 job.add(new NumericField("year").setIntValue(year));
59 public void testSimple() throws Exception {
61 final Directory dir = newDirectory();
62 final RandomIndexWriter w = new RandomIndexWriter(random, dir);
64 final List<Document> docs = new ArrayList<Document>();
66 docs.add(makeJob("java", 2007));
67 docs.add(makeJob("python", 2010));
68 docs.add(makeResume("Lisa", "United Kingdom"));
72 docs.add(makeJob("ruby", 2005));
73 docs.add(makeJob("java", 2006));
74 docs.add(makeResume("Frank", "United States"));
77 IndexReader r = w.getReader();
79 IndexSearcher s = new IndexSearcher(r);
81 // Create a filter that defines "parent" documents in the index - in this case resumes
82 Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("docType", "resume"))));
84 // Define child document criteria (finds an example of relevant work experience)
85 BooleanQuery childQuery = new BooleanQuery();
86 childQuery.add(new BooleanClause(new TermQuery(new Term("skill", "java")), Occur.MUST));
87 childQuery.add(new BooleanClause(NumericRangeQuery.newIntRange("year", 2006, 2011, true, true), Occur.MUST));
89 // Define parent document criteria (find a resident in the UK)
90 Query parentQuery = new TermQuery(new Term("country", "United Kingdom"));
92 // Wrap the child document query to 'join' any matches
93 // up to corresponding parent:
94 BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
96 // Combine the parent and nested child queries into a single query for a candidate
97 BooleanQuery fullQuery = new BooleanQuery();
98 fullQuery.add(new BooleanClause(parentQuery, Occur.MUST));
99 fullQuery.add(new BooleanClause(childJoinQuery, Occur.MUST));
101 BlockJoinCollector c = new BlockJoinCollector(Sort.RELEVANCE, 1, true, false);
103 s.search(fullQuery, c);
105 TopGroups<Integer> results = c.getTopGroups(childJoinQuery, null, 0, 10, 0, true);
107 //assertEquals(1, results.totalHitCount);
108 assertEquals(1, results.totalGroupedHitCount);
109 assertEquals(1, results.groups.length);
111 final GroupDocs<Integer> group = results.groups[0];
112 assertEquals(1, group.totalHits);
114 Document childDoc = s.doc(group.scoreDocs[0].doc);
115 //System.out.println(" doc=" + group.scoreDocs[0].doc);
116 assertEquals("java", childDoc.get("skill"));
117 assertNotNull(group.groupValue);
118 Document parentDoc = s.doc(group.groupValue);
119 assertEquals("Lisa", parentDoc.get("name"));
125 private String[][] getRandomFields(int maxUniqueValues) {
127 final String[][] fields = new String[_TestUtil.nextInt(random, 2, 4)][];
128 for(int fieldID=0;fieldID<fields.length;fieldID++) {
129 final int valueCount;
133 valueCount = _TestUtil.nextInt(random, 1, maxUniqueValues);
136 final String[] values = fields[fieldID] = new String[valueCount];
137 for(int i=0;i<valueCount;i++) {
138 values[i] = _TestUtil.randomRealisticUnicodeString(random);
139 //values[i] = _TestUtil.randomSimpleString(random);
146 private Term randomParentTerm(String[] values) {
147 return new Term("parent0", values[random.nextInt(values.length)]);
150 private Term randomChildTerm(String[] values) {
151 return new Term("child0", values[random.nextInt(values.length)]);
154 private Sort getRandomSort(String prefix, int numFields) {
155 final List<SortField> sortFields = new ArrayList<SortField>();
156 // TODO: sometimes sort by score; problem is scores are
157 // not comparable across the two indices
158 // sortFields.add(SortField.FIELD_SCORE);
159 if (random.nextBoolean()) {
160 sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
161 } else if (random.nextBoolean()) {
162 sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
163 sortFields.add(new SortField(prefix + random.nextInt(numFields), SortField.STRING, random.nextBoolean()));
166 sortFields.add(new SortField(prefix + "ID", SortField.INT));
167 return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
170 public void testRandom() throws Exception {
171 // We build two indices at once: one normalized (which
172 // BlockJoinQuery/Collector can query) and the other w/
173 // same docs just fully denormalized:
174 final Directory dir = newDirectory();
175 final Directory joinDir = newDirectory();
177 final int numParentDocs = _TestUtil.nextInt(random, 100*RANDOM_MULTIPLIER, 300*RANDOM_MULTIPLIER);
178 //final int numParentDocs = 30;
180 // Values for parent fields:
181 final String[][] parentFields = getRandomFields(numParentDocs/2);
182 // Values for child fields:
183 final String[][] childFields = getRandomFields(numParentDocs);
185 // TODO: test star join, nested join cases too!
186 final RandomIndexWriter w = new RandomIndexWriter(random, dir);
187 final RandomIndexWriter joinW = new RandomIndexWriter(random, joinDir);
188 for(int parentDocID=0;parentDocID<numParentDocs;parentDocID++) {
189 Document parentDoc = new Document();
190 Document parentJoinDoc = new Document();
191 Field id = newField("parentID", ""+parentDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
193 parentJoinDoc.add(id);
194 parentJoinDoc.add(newField("isParent", "x", Field.Index.NOT_ANALYZED));
195 for(int field=0;field<parentFields.length;field++) {
196 if (random.nextDouble() < 0.9) {
197 Field f = newField("parent" + field,
198 parentFields[field][random.nextInt(parentFields[field].length)],
199 Field.Index.NOT_ANALYZED);
201 parentJoinDoc.add(f);
205 final List<Document> joinDocs = new ArrayList<Document>();
208 System.out.println(" " + parentDoc);
211 final int numChildDocs = _TestUtil.nextInt(random, 1, 20);
212 for(int childDocID=0;childDocID<numChildDocs;childDocID++) {
213 // Denormalize: copy all parent fields into child doc:
214 Document childDoc = _TestUtil.cloneDocument(parentDoc);
215 Document joinChildDoc = new Document();
216 joinDocs.add(joinChildDoc);
218 Field childID = newField("childID", ""+childDocID, Field.Store.YES, Field.Index.NOT_ANALYZED);
219 childDoc.add(childID);
220 joinChildDoc.add(childID);
222 for(int childFieldID=0;childFieldID<childFields.length;childFieldID++) {
223 if (random.nextDouble() < 0.9) {
224 Field f = newField("child" + childFieldID,
225 childFields[childFieldID][random.nextInt(childFields[childFieldID].length)],
226 Field.Index.NOT_ANALYZED);
233 System.out.println(" " + joinChildDoc);
236 w.addDocument(childDoc);
240 joinDocs.add(parentJoinDoc);
241 joinW.addDocuments(joinDocs);
244 final IndexReader r = w.getReader();
246 final IndexReader joinR = joinW.getReader();
250 System.out.println("TEST: reader=" + r);
251 System.out.println("TEST: joinReader=" + joinR);
253 for(int docIDX=0;docIDX<joinR.maxDoc();docIDX++) {
254 System.out.println(" docID=" + docIDX + " doc=" + joinR.document(docIDX));
258 final IndexSearcher s = new IndexSearcher(r);
259 s.setDefaultFieldSortScoring(true, true);
261 final IndexSearcher joinS = new IndexSearcher(joinR);
263 final Filter parentsFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("isParent", "x"))));
265 final int iters = 200*RANDOM_MULTIPLIER;
267 for(int iter=0;iter<iters;iter++) {
269 System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
272 final Query childQuery;
273 if (random.nextInt(3) == 2) {
274 final int childFieldID = random.nextInt(childFields.length);
275 childQuery = new TermQuery(new Term("child" + childFieldID,
276 childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
277 } else if (random.nextInt(3) == 2) {
278 BooleanQuery bq = new BooleanQuery();
280 final int numClauses = _TestUtil.nextInt(random, 2, 4);
281 boolean didMust = false;
282 for(int clauseIDX=0;clauseIDX<numClauses;clauseIDX++) {
284 BooleanClause.Occur occur;
285 if (!didMust && random.nextBoolean()) {
286 occur = random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT;
287 clause = new TermQuery(randomChildTerm(childFields[0]));
290 occur = BooleanClause.Occur.SHOULD;
291 final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
292 clause = new TermQuery(new Term("child" + childFieldID,
293 childFields[childFieldID][random.nextInt(childFields[childFieldID].length)]));
295 bq.add(clause, occur);
298 BooleanQuery bq = new BooleanQuery();
301 bq.add(new TermQuery(randomChildTerm(childFields[0])),
302 BooleanClause.Occur.MUST);
303 final int childFieldID = _TestUtil.nextInt(random, 1, childFields.length-1);
304 bq.add(new TermQuery(new Term("child" + childFieldID, childFields[childFieldID][random.nextInt(childFields[childFieldID].length)])),
305 random.nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
308 final BlockJoinQuery childJoinQuery = new BlockJoinQuery(childQuery, parentsFilter, BlockJoinQuery.ScoreMode.Avg);
310 // To run against the block-join index:
311 final Query parentJoinQuery;
313 // Same query as parentJoinQuery, but to run against
314 // the fully denormalized index (so we can compare)
316 final Query parentQuery;
318 if (random.nextBoolean()) {
319 parentQuery = childQuery;
320 parentJoinQuery = childJoinQuery;
322 // AND parent field w/ child field
323 final BooleanQuery bq = new BooleanQuery();
324 parentJoinQuery = bq;
325 final Term parentTerm = randomParentTerm(parentFields[0]);
326 if (random.nextBoolean()) {
327 bq.add(childJoinQuery, BooleanClause.Occur.MUST);
328 bq.add(new TermQuery(parentTerm),
329 BooleanClause.Occur.MUST);
331 bq.add(new TermQuery(parentTerm),
332 BooleanClause.Occur.MUST);
333 bq.add(childJoinQuery, BooleanClause.Occur.MUST);
336 final BooleanQuery bq2 = new BooleanQuery();
338 if (random.nextBoolean()) {
339 bq2.add(childQuery, BooleanClause.Occur.MUST);
340 bq2.add(new TermQuery(parentTerm),
341 BooleanClause.Occur.MUST);
343 bq2.add(new TermQuery(parentTerm),
344 BooleanClause.Occur.MUST);
345 bq2.add(childQuery, BooleanClause.Occur.MUST);
349 final Sort parentSort = getRandomSort("parent", parentFields.length);
350 final Sort childSort = getRandomSort("child", childFields.length);
353 System.out.println("\nTEST: query=" + parentQuery + " joinQuery=" + parentJoinQuery + " parentSort=" + parentSort + " childSort=" + childSort);
357 final List<SortField> sortFields = new ArrayList<SortField>(Arrays.asList(parentSort.getSort()));
358 sortFields.addAll(Arrays.asList(childSort.getSort()));
359 final Sort parentAndChildSort = new Sort(sortFields.toArray(new SortField[sortFields.size()]));
361 final TopDocs results = s.search(parentQuery, null, r.numDocs(),
365 System.out.println("\nTEST: normal index gets " + results.totalHits + " hits");
366 final ScoreDoc[] hits = results.scoreDocs;
367 for(int hitIDX=0;hitIDX<hits.length;hitIDX++) {
368 final Document doc = s.doc(hits[hitIDX].doc);
369 //System.out.println(" score=" + hits[hitIDX].score + " parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
370 System.out.println(" parentID=" + doc.get("parentID") + " childID=" + doc.get("childID") + " (docID=" + hits[hitIDX].doc + ")");
371 FieldDoc fd = (FieldDoc) hits[hitIDX];
372 if (fd.fields != null) {
373 System.out.print(" ");
374 for(Object o : fd.fields) {
375 if (o instanceof BytesRef) {
376 System.out.print(((BytesRef) o).utf8ToString() + " ");
378 System.out.print(o + " ");
381 System.out.println();
386 final BlockJoinCollector c = new BlockJoinCollector(parentSort, 10, true, true);
388 joinS.search(parentJoinQuery, c);
390 final int hitsPerGroup = _TestUtil.nextInt(random, 1, 20);
391 //final int hitsPerGroup = 100;
392 final TopGroups<Integer> joinResults = c.getTopGroups(childJoinQuery, childSort, 0, hitsPerGroup, 0, true);
395 System.out.println("\nTEST: block join index gets " + (joinResults == null ? 0 : joinResults.groups.length) + " groups; hitsPerGroup=" + hitsPerGroup);
396 if (joinResults != null) {
397 final GroupDocs<Integer>[] groups = joinResults.groups;
398 for(int groupIDX=0;groupIDX<groups.length;groupIDX++) {
399 final GroupDocs<Integer> group = groups[groupIDX];
400 if (group.groupSortValues != null) {
401 System.out.print(" ");
402 for(Object o : group.groupSortValues) {
403 if (o instanceof BytesRef) {
404 System.out.print(((BytesRef) o).utf8ToString() + " ");
406 System.out.print(o + " ");
409 System.out.println();
412 assertNotNull(group.groupValue);
413 final Document parentDoc = joinS.doc(group.groupValue);
414 System.out.println(" group parentID=" + parentDoc.get("parentID") + " (docID=" + group.groupValue + ")");
415 for(int hitIDX=0;hitIDX<group.scoreDocs.length;hitIDX++) {
416 final Document doc = joinS.doc(group.scoreDocs[hitIDX].doc);
417 //System.out.println(" score=" + group.scoreDocs[hitIDX].score + " childID=" + doc.get("childID") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
418 System.out.println(" childID=" + doc.get("childID") + " child0=" + doc.get("child0") + " (docID=" + group.scoreDocs[hitIDX].doc + ")");
424 if (results.totalHits == 0) {
425 assertNull(joinResults);
427 compareHits(r, joinR, results, joinResults);
437 private void compareHits(IndexReader r, IndexReader joinR, TopDocs results, TopGroups<Integer> joinResults) throws Exception {
438 // results is 'complete'; joinResults is a subset
440 int joinGroupUpto = 0;
442 final ScoreDoc[] hits = results.scoreDocs;
443 final GroupDocs<Integer>[] groupDocs = joinResults.groups;
445 while(joinGroupUpto < groupDocs.length) {
446 final GroupDocs<Integer> group = groupDocs[joinGroupUpto++];
447 final ScoreDoc[] groupHits = group.scoreDocs;
448 assertNotNull(group.groupValue);
449 final Document parentDoc = joinR.document(group.groupValue);
450 final String parentID = parentDoc.get("parentID");
451 //System.out.println("GROUP groupDoc=" + group.groupDoc + " parent=" + parentDoc);
452 assertNotNull(parentID);
453 assertTrue(groupHits.length > 0);
454 for(int hitIDX=0;hitIDX<groupHits.length;hitIDX++) {
455 final Document nonJoinHit = r.document(hits[resultUpto++].doc);
456 final Document joinHit = joinR.document(groupHits[hitIDX].doc);
457 assertEquals(parentID,
458 nonJoinHit.get("parentID"));
459 assertEquals(joinHit.get("childID"),
460 nonJoinHit.get("childID"));
463 if (joinGroupUpto < groupDocs.length) {
464 // Advance non-join hit to the next parentID:
465 //System.out.println(" next joingroupUpto=" + joinGroupUpto + " gd.length=" + groupDocs.length + " parentID=" + parentID);
467 assertTrue(resultUpto < hits.length);
468 if (!parentID.equals(r.document(hits[resultUpto].doc).get("parentID"))) {