// Reconstructed from the diff that adds this file (new file, blob 326c50b):
// lucene-java-3.5.0/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TermAllGroupHeadsCollectorTest.java
package org.apache.lucene.search.grouping;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import java.io.IOException;
import java.util.*;

/**
 * Tests the collectors returned by {@link TermAllGroupHeadsCollector#create}:
 * {@link #testBasic()} checks a small hand-built index against hard-coded
 * group heads, {@link #testRandom()} checks randomly generated indexes
 * against group heads computed by brute force in this class.
 */
public class TermAllGroupHeadsCollectorTest extends LuceneTestCase {

  /**
   * Indexes eight documents (three authors plus two documents without an
   * author field) and verifies the group heads for several queries and
   * within-group sorts, both as a doc id array and as a bit set.
   */
  public void testBasic() throws Exception {
    final String groupField = "author";
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    // The leading number in each comment is the doc id the assertions below rely on.
    // 0
    w.addDocument(createDoc(groupField, "author1", "random text", "1"));
    // 1
    w.addDocument(createDoc(groupField, "author1", "some more random text blob", "2"));
    // 2
    w.addDocument(createDoc(groupField, "author1", "some more random textual data", "3"));
    w.commit(); // To ensure a second segment

    // 3
    w.addDocument(createDoc(groupField, "author2", "some random text", "4"));
    // 4
    w.addDocument(createDoc(groupField, "author3", "some more random text", "5"));
    // 5
    w.addDocument(createDoc(groupField, "author3", "random blob", "6"));
    // 6 -- no author field (shares the "id" value "6" with doc 5)
    w.addDocument(createDoc(groupField, null, "random word stuck in alot of other text", "6"));
    // 7 -- no author field
    w.addDocument(createDoc(groupField, null, "random word stuck in alot of other text", "7"));

    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
    w.close();

    // Descending numeric sort on "id": the highest id of each group is its head.
    Sort sortWithinGroup = new Sort(new SortField("id", SortField.INT, true));
    assertGroupHeads(indexSearcher, groupField, sortWithinGroup, "random", 2, 3, 5, 7);
    assertGroupHeads(indexSearcher, groupField, sortWithinGroup, "some", 2, 3, 4);
    assertGroupHeads(indexSearcher, groupField, sortWithinGroup, "blob", 1, 5);

    // STRING sort type triggers different implementation
    Sort sortWithinGroup2 = new Sort(new SortField("id", SortField.STRING, true));
    assertGroupHeads(indexSearcher, groupField, sortWithinGroup2, "random", 2, 3, 5, 7);

    // Ascending sort, so the expected head of each group is its document with
    // the lowest "id" value.
    Sort sortWithinGroup3 = new Sort(new SortField("id", SortField.STRING, false));
    assertGroupHeads(indexSearcher, groupField, sortWithinGroup3, "random", 0, 3, 4, 6);

    indexSearcher.getIndexReader().close();
    dir.close();
  }

  /**
   * Builds several random indexes and, for 100 random searches on each,
   * compares the collector's group heads with the ones computed by
   * {@link #createExpectedGroupHeads}.
   */
  public void testRandom() throws Exception {
    int numberOfRuns = _TestUtil.nextInt(random, 3, 6);
    for (int iter = 0; iter < numberOfRuns; iter++) {
      if (VERBOSE) {
        System.out.println(String.format("TEST: iter=%d total=%d", iter, numberOfRuns));
      }

      final int numDocs = _TestUtil.nextInt(random, 100, 1000) * RANDOM_MULTIPLIER;
      final int numGroups = _TestUtil.nextInt(random, 1, numDocs);

      if (VERBOSE) {
        System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
      }

      final List<String> groups = new ArrayList<String>();
      for (int i = 0; i < numGroups; i++) {
        groups.add(_TestUtil.randomRealisticUnicodeString(random));
      }
      final String[] contentStrings = new String[_TestUtil.nextInt(random, 2, 20)];
      if (VERBOSE) {
        System.out.println("TEST: create fake content");
      }
      for (int contentIDX = 0; contentIDX < contentStrings.length; contentIDX++) {
        // Every content string is "realN " followed by 0-9 repetitions of "fake ".
        final StringBuilder sb = new StringBuilder();
        sb.append("real").append(random.nextInt(3)).append(' ');
        final int fakeCount = random.nextInt(10);
        for (int fakeIDX = 0; fakeIDX < fakeCount; fakeIDX++) {
          sb.append("fake ");
        }
        contentStrings[contentIDX] = sb.toString();
        if (VERBOSE) {
          System.out.println(" content=" + sb.toString());
        }
      }

      Directory dir = newDirectory();
      RandomIndexWriter w = new RandomIndexWriter(
          random,
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT,
              new MockAnalyzer(random)));

      // Two reusable documents sharing the same Field instances; docNoGroup
      // simply lacks the "group" field.
      Document doc = new Document();
      Document docNoGroup = new Document();
      Field group = newField("group", "", Field.Index.NOT_ANALYZED);
      doc.add(group);
      Field sort1 = newField("sort1", "", Field.Index.NOT_ANALYZED);
      doc.add(sort1);
      docNoGroup.add(sort1);
      Field sort2 = newField("sort2", "", Field.Index.NOT_ANALYZED);
      doc.add(sort2);
      docNoGroup.add(sort2);
      Field sort3 = newField("sort3", "", Field.Index.NOT_ANALYZED);
      doc.add(sort3);
      docNoGroup.add(sort3);
      Field content = newField("content", "", Field.Index.ANALYZED);
      doc.add(content);
      docNoGroup.add(content);
      NumericField id = new NumericField("id");
      doc.add(id);
      docNoGroup.add(id);
      final GroupDoc[] groupDocs = new GroupDoc[numDocs];
      for (int i = 0; i < numDocs; i++) {
        final String groupValue;
        if (random.nextInt(24) == 17) {
          // So we test the "doc doesn't have the group'd
          // field" case:
          groupValue = null;
        } else {
          groupValue = groups.get(random.nextInt(groups.size()));
        }

        final GroupDoc groupDoc = new GroupDoc(
            i,
            groupValue,
            groups.get(random.nextInt(groups.size())),
            groups.get(random.nextInt(groups.size())),
            String.format("%05d", i),
            contentStrings[random.nextInt(contentStrings.length)]
        );

        if (VERBOSE) {
          System.out.println(" doc content=" + groupDoc.content + " id=" + i + " group=" + groupDoc.group + " sort1=" + groupDoc.sort1 + " sort2=" + groupDoc.sort2 + " sort3=" + groupDoc.sort3);
        }

        groupDocs[i] = groupDoc;
        if (groupDoc.group != null) {
          group.setValue(groupDoc.group);
        }
        sort1.setValue(groupDoc.sort1);
        sort2.setValue(groupDoc.sort2);
        sort3.setValue(groupDoc.sort3);
        content.setValue(groupDoc.content);
        id.setIntValue(groupDoc.id);
        if (groupDoc.group == null) {
          w.addDocument(docNoGroup);
        } else {
          w.addDocument(doc);
        }
      }

      final IndexReader r = w.getReader();
      w.close();

      // NOTE: intentional but temporary field cache insanity!
      final int[] docIdToFieldId = FieldCache.DEFAULT.getInts(r, "id");
      final int[] fieldIdToDocID = new int[numDocs];
      for (int i = 0; i < docIdToFieldId.length; i++) {
        int fieldId = docIdToFieldId[i];
        fieldIdToDocID[fieldId] = i;
      }

      try {
        final IndexSearcher s = newSearcher(r);

        // Record the score of every document; each document matches exactly
        // one of the three "realN" terms, so each score is set exactly once.
        for (int contentID = 0; contentID < 3; contentID++) {
          final ScoreDoc[] hits = s.search(new TermQuery(new Term("content", "real" + contentID)), numDocs).scoreDocs;
          for (ScoreDoc hit : hits) {
            final GroupDoc gd = groupDocs[docIdToFieldId[hit.doc]];
            assertTrue(gd.score == 0.0);
            gd.score = hit.score;
            int docId = gd.id;
            assertEquals(docId, docIdToFieldId[hit.doc]);
          }
        }

        for (GroupDoc gd : groupDocs) {
          assertTrue(gd.score != 0.0);
        }

        for (int searchIter = 0; searchIter < 100; searchIter++) {

          if (VERBOSE) {
            System.out.println("TEST: searchIter=" + searchIter);
          }

          final String searchTerm = "real" + random.nextInt(3);
          boolean sortByScoreOnly = random.nextBoolean();
          Sort sortWithinGroup = getRandomSort(sortByScoreOnly);
          AbstractAllGroupHeadsCollector allGroupHeadsCollector = TermAllGroupHeadsCollector.create("group", sortWithinGroup);
          s.search(new TermQuery(new Term("content", searchTerm)), allGroupHeadsCollector);
          int[] expectedGroupHeads = createExpectedGroupHeads(searchTerm, groupDocs, sortWithinGroup, sortByScoreOnly, fieldIdToDocID);
          int[] actualGroupHeads = allGroupHeadsCollector.retrieveGroupHeads();
          // The actual group heads contain Lucene ids. Need to change them into our id value.
          for (int i = 0; i < actualGroupHeads.length; i++) {
            actualGroupHeads[i] = docIdToFieldId[actualGroupHeads[i]];
          }
          // Allows us to easily iterate and assert the actual and expected results.
          Arrays.sort(expectedGroupHeads);
          Arrays.sort(actualGroupHeads);

          if (VERBOSE) {
            System.out.println("Collector: " + allGroupHeadsCollector.getClass().getSimpleName());
            System.out.println("Sort within group: " + sortWithinGroup);
            System.out.println("Num group: " + numGroups);
            System.out.println("Num doc: " + numDocs);
            printGroupHeads("\n=== Expected: \n", expectedGroupHeads, groupDocs);
            printGroupHeads("\n=== Actual: \n", actualGroupHeads, groupDocs);
            System.out.println("\n===================================================================================");
          }

          assertEquals(expectedGroupHeads.length, actualGroupHeads.length);
          for (int i = 0; i < expectedGroupHeads.length; i++) {
            assertEquals(expectedGroupHeads[i], actualGroupHeads[i]);
          }
        }
        s.close();
      } finally {
        FieldCache.DEFAULT.purge(r);
      }

      r.close();
      dir.close();
    }
  }

  /**
   * Creates a document with the fields used by {@link #testBasic()}.
   *
   * @param groupField name of the group field
   * @param groupValue value of the group field, or {@code null} to leave the field out
   * @param content    value of the analyzed "content" field
   * @param id         value of the un-analyzed "id" field
   */
  private static Document createDoc(String groupField, String groupValue, String content, String id) {
    Document doc = new Document();
    if (groupValue != null) {
      doc.add(new Field(groupField, groupValue, Field.Store.YES, Field.Index.ANALYZED));
    }
    doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    return doc;
  }

  /**
   * Runs a term query on the "content" field with a fresh collector and
   * asserts that both forms of the collected group heads (doc id array and
   * bit set) match the expected doc ids.
   */
  private void assertGroupHeads(IndexSearcher searcher, String groupField, Sort sortWithinGroup,
                                String contentTerm, int... expectedHeads) throws IOException {
    AbstractAllGroupHeadsCollector collector = TermAllGroupHeadsCollector.create(groupField, sortWithinGroup);
    searcher.search(new TermQuery(new Term("content", contentTerm)), collector);

    int maxDoc = searcher.maxDoc();
    int[] actualHeads = collector.retrieveGroupHeads();
    assertTrue(
        "expected group heads " + Arrays.toString(expectedHeads) + " but got " + Arrays.toString(actualHeads),
        arrayContains(expectedHeads, actualHeads));
    assertTrue(
        "bit set of group heads does not match " + Arrays.toString(expectedHeads),
        openBitSetContains(expectedHeads, collector.retrieveGroupHeads(maxDoc), maxDoc));
  }

  /** Prints one line per group head; only used for VERBOSE output. */
  private static void printGroupHeads(String header, int[] groupHeads, GroupDoc[] groupDocs) {
    System.out.println(header);
    for (int docId : groupHeads) {
      GroupDoc groupDoc = groupDocs[docId];
      System.out.println(
          String.format(
              "Group:%10s score%5f Sort1:%10s Sort2:%10s Sort3:%10s doc:%5d",
              groupDoc.group, groupDoc.score, groupDoc.sort1,
              groupDoc.sort2, groupDoc.sort3, docId
          )
      );
    }
  }

  /**
   * Returns {@code true} if both arrays hold the same values, ignoring order.
   * The arguments are not modified.
   */
  private boolean arrayContains(int[] expected, int[] actual) {
    if (expected.length != actual.length) {
      return false;
    }

    // Compare sorted copies so duplicate values have to match up as well.
    int[] sortedExpected = expected.clone();
    int[] sortedActual = actual.clone();
    Arrays.sort(sortedExpected);
    Arrays.sort(sortedActual);
    return Arrays.equals(sortedExpected, sortedActual);
  }

  /**
   * Returns {@code true} if {@code actual} has exactly as many bits set as
   * there are expected docs and every expected doc id is set in it.
   *
   * @param maxDoc size of the temporary bit set built from {@code expectedDocs}
   */
  private boolean openBitSetContains(int[] expectedDocs, FixedBitSet actual, int maxDoc) throws IOException {
    if (expectedDocs.length != actual.cardinality()) {
      return false;
    }

    FixedBitSet expected = new FixedBitSet(maxDoc);
    for (int expectedDoc : expectedDocs) {
      expected.set(expectedDoc);
    }

    int docId;
    DocIdSetIterator iterator = expected.iterator();
    while ((docId = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (!actual.get(docId)) {
        return false;
      }
    }

    return true;
  }

  /**
   * Computes the expected group heads by brute force: buckets every document
   * whose content starts with {@code searchTerm} by its group value (a
   * {@code null} group is a bucket of its own), sorts each bucket with
   * {@link #getComparator} and takes the first document.
   *
   * @return the {@code id} values (not Lucene doc ids) of the group heads, in no particular order
   */
  private int[] createExpectedGroupHeads(String searchTerm, GroupDoc[] groupDocs, Sort docSort, boolean sortByScoreOnly, int[] fieldIdToDocID) throws IOException {
    Map<String, List<GroupDoc>> groupHeads = new HashMap<String, List<GroupDoc>>();
    for (GroupDoc groupDoc : groupDocs) {
      if (!groupDoc.content.startsWith(searchTerm)) {
        continue;
      }

      List<GroupDoc> docsInGroup = groupHeads.get(groupDoc.group);
      if (docsInGroup == null) {
        docsInGroup = new ArrayList<GroupDoc>();
        groupHeads.put(groupDoc.group, docsInGroup);
      }
      docsInGroup.add(groupDoc);
    }

    // The comparator does not depend on the group, so build it once.
    final Comparator<GroupDoc> comparator = getComparator(docSort, sortByScoreOnly, fieldIdToDocID);
    int[] allGroupHeads = new int[groupHeads.size()];
    int i = 0;
    for (List<GroupDoc> docs : groupHeads.values()) {
      Collections.sort(docs, comparator);
      allGroupHeads[i++] = docs.get(0).id;
    }

    return allGroupHeads;
  }

  /**
   * Builds a random within-group sort.
   *
   * @param scoreOnly if {@code true} the sort consists of the score only;
   *                  otherwise a tie-breaking sort on "sort3" or "id" is always appended
   */
  private Sort getRandomSort(boolean scoreOnly) {
    final List<SortField> sortFields = new ArrayList<SortField>();
    if (random.nextInt(7) == 2 || scoreOnly) {
      sortFields.add(SortField.FIELD_SCORE);
    } else {
      if (random.nextBoolean()) {
        if (random.nextBoolean()) {
          sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean()));
        } else {
          sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean()));
        }
      } else if (random.nextBoolean()) {
        sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean()));
        sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean()));
      }
    }
    // Break ties:
    if (random.nextBoolean() && !scoreOnly) {
      sortFields.add(new SortField("sort3", SortField.STRING));
    } else if (!scoreOnly) {
      sortFields.add(new SortField("id", SortField.INT));
    }
    return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
  }

  /**
   * Returns a comparator over {@link GroupDoc}s that mirrors {@code sort}.
   * Higher scores sort first; when sorting by score only, equal scores are
   * ordered by Lucene doc id (looked up through {@code fieldIdToDocID}).
   * The comparator fails the test if two documents tie on every sort field.
   */
  private Comparator<GroupDoc> getComparator(Sort sort, final boolean sortByScoreOnly, final int[] fieldIdToDocID) {
    final SortField[] sortFields = sort.getSort();
    return new Comparator<GroupDoc>() {
      public int compare(GroupDoc d1, GroupDoc d2) {
        for (SortField sf : sortFields) {
          final int cmp;
          if (sf.getType() == SortField.SCORE) {
            if (d1.score > d2.score) {
              cmp = -1;
            } else if (d1.score < d2.score) {
              cmp = 1;
            } else {
              cmp = sortByScoreOnly ? compareInts(fieldIdToDocID[d1.id], fieldIdToDocID[d2.id]) : 0;
            }
          } else if (sf.getField().equals("sort1")) {
            cmp = d1.sort1.compareTo(d2.sort1);
          } else if (sf.getField().equals("sort2")) {
            cmp = d1.sort2.compareTo(d2.sort2);
          } else if (sf.getField().equals("sort3")) {
            cmp = d1.sort3.compareTo(d2.sort3);
          } else {
            assertEquals("id", sf.getField());
            cmp = compareInts(d1.id, d2.id);
          }
          if (cmp != 0) {
            return sf.getReverse() ? -cmp : cmp;
          }
        }
        // Our sort always fully tie breaks:
        fail();
        return 0;
      }
    };
  }

  /** Three-way comparison of two ints that cannot overflow, unlike {@code a - b}. */
  private static int compareInts(int a, int b) {
    return a < b ? -1 : (a == b ? 0 : 1);
  }

  /** In-memory copy of one indexed document, used to compute the expected results. */
  private static class GroupDoc {
    final int id;
    final String group;
    final String sort1;
    final String sort2;
    final String sort3;
    // content must be "realN ..."
    final String content;
    // Set by testRandom from the search hits; 0 until then.
    float score;

    public GroupDoc(int id, String group, String sort1, String sort2, String sort3, String content) {
      this.id = id;
      this.group = group;
      this.sort1 = sort1;
      this.sort2 = sort2;
      this.sort3 = sort3;
      this.content = content;
    }

  }

}