2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.lucene.benchmark.quality;
19 import java.io.PrintWriter;
20 import java.text.NumberFormat;
21 import java.util.ArrayList;
24 * Results of quality benchmark run for a single query or for a set of queries.
26 public class QualityStats {
28 /** Number of points for which precision is computed. */
29 public static final int MAX_POINTS = 20;
31 private double maxGoodPoints;
32 private double recall;
34 private double pReleventSum = 0;
35 private double numPoints = 0;
36 private double numGoodPoints = 0;
37 private double mrr = 0;
38 private long searchTime;
39 private long docNamesExtractTime;
42 * A rank at which a relevant doc was found.
44 public static class RecallPoint {
46 private double recall;
47 private RecallPoint(int rank, double recall) {
51 /** Returns the rank: where on the list of returned docs this relevant doc appeared. */
52 public int getRank() {
55 /** Returns the recall: how many relevant docs were returned up to this point, inclusive. */
56 public double getRecall() {
61 private ArrayList<RecallPoint> recallPoints;
64 * Construct a QualityStats object with anticipated maximal number of relevant hits.
65 * @param maxGoodPoints maximal possible relevant hits.
67 public QualityStats(double maxGoodPoints, long searchTime) {
68 this.maxGoodPoints = maxGoodPoints;
69 this.searchTime = searchTime;
70 this.recallPoints = new ArrayList<RecallPoint>();
71 pAt = new double[MAX_POINTS+1]; // pAt[0] unused.
75 * Add a (possibly relevant) doc.
76 * @param n rank of the added doc (its ordinal position within the query results).
77 * @param isRelevant true if the added doc is relevant, false otherwise.
79 public void addResult(int n, boolean isRelevant, long docNameExtractTime) {
80 if (Math.abs(numPoints+1 - n) > 1E-6) {
81 throw new IllegalArgumentException("point "+n+" illegal after "+numPoints+" points!");
85 recallPoints.add(new RecallPoint(n,numGoodPoints));
86 if (recallPoints.size()==1 && n<=5) { // first point, but only within 5 top scores.
91 double p = numGoodPoints / numPoints;
98 recall = maxGoodPoints<=0 ? p : numGoodPoints/maxGoodPoints;
99 docNamesExtractTime += docNameExtractTime;
103 * Return the precision at rank n:
104 * |{relevant hits within first <code>n</code> hits}| / <code>n</code>.
105 * @param n requested precision point, must be at least 1 and at most {@link #MAX_POINTS}.
107 public double getPrecisionAt(int n) {
108 if (n<1 || n>MAX_POINTS) {
109 throw new IllegalArgumentException("n="+n+" - but it must be in [1,"+MAX_POINTS+"] range!");
112 return (numPoints * pAt[(int)numPoints])/n;
118 * Return the average precision at recall points.
120 public double getAvp() {
121 return maxGoodPoints==0 ? 0 : pReleventSum/maxGoodPoints;
125 * Return the recall: |{relevant hits found}| / |{relevant hits existing}|.
127 public double getRecall() {
132 * Log information on this QualityStats object.
133 * @param logger Logger.
134 * @param prefix prefix before each log line.
136 public void log(String title, int paddLines, PrintWriter logger, String prefix) {
137 for (int i=0; i<paddLines; i++) {
140 if (title!=null && title.trim().length()>0) {
141 logger.println(title);
143 prefix = prefix==null ? "" : prefix;
144 NumberFormat nf = NumberFormat.getInstance();
145 nf.setMaximumFractionDigits(3);
146 nf.setMinimumFractionDigits(3);
147 nf.setGroupingUsed(true);
149 logger.println(prefix+format("Search Seconds: ",M)+
150 fracFormat(nf.format((double)searchTime/1000)));
151 logger.println(prefix+format("DocName Seconds: ",M)+
152 fracFormat(nf.format((double)docNamesExtractTime/1000)));
153 logger.println(prefix+format("Num Points: ",M)+
154 fracFormat(nf.format(numPoints)));
155 logger.println(prefix+format("Num Good Points: ",M)+
156 fracFormat(nf.format(numGoodPoints)));
157 logger.println(prefix+format("Max Good Points: ",M)+
158 fracFormat(nf.format(maxGoodPoints)));
159 logger.println(prefix+format("Average Precision: ",M)+
160 fracFormat(nf.format(getAvp())));
161 logger.println(prefix+format("MRR: ",M)+
162 fracFormat(nf.format(getMRR())));
163 logger.println(prefix+format("Recall: ",M)+
164 fracFormat(nf.format(getRecall())));
165 for (int i=1; i<(int)numPoints && i<pAt.length; i++) {
166 logger.println(prefix+format("Precision At "+i+": ",M)+
167 fracFormat(nf.format(getPrecisionAt(i))));
169 for (int i=0; i<paddLines; i++) {
174 private static String padd = " ";
175 private String format(String s, int minLen) {
176 s = (s==null ? "" : s);
177 int n = Math.max(minLen,s.length());
178 return (s+padd).substring(0,n);
180 private String fracFormat(String frac) {
181 int k = frac.indexOf('.');
182 String s1 = padd+frac.substring(0,k);
183 int n = Math.max(k,6);
184 s1 = s1.substring(s1.length()-n);
185 return s1 + frac.substring(k);
189 * Create a QualityStats object that is the average of the input QualityStats objects.
190 * @param stats array of input stats to be averaged.
191 * @return an average over the input stats.
193 public static QualityStats average(QualityStats[] stats) {
194 QualityStats avg = new QualityStats(0,0);
195 if (stats.length==0) {
196 // weird, no stats to average!
199 int m = 0; // queries with positive judgements
201 for (int i=0; i<stats.length; i++) {
202 avg.searchTime += stats[i].searchTime;
203 avg.docNamesExtractTime += stats[i].docNamesExtractTime;
204 if (stats[i].maxGoodPoints>0) {
206 avg.numGoodPoints += stats[i].numGoodPoints;
207 avg.numPoints += stats[i].numPoints;
208 avg.pReleventSum += stats[i].getAvp();
209 avg.recall += stats[i].recall;
210 avg.mrr += stats[i].getMRR();
211 avg.maxGoodPoints += stats[i].maxGoodPoints;
212 for (int j=1; j<avg.pAt.length; j++) {
213 avg.pAt[j] += stats[i].getPrecisionAt(j);
217 assert m>0 : "Fishy: no \"good\" queries!";
218 // take average: times go by all queries, other measures go by "good" queries only.
219 avg.searchTime /= stats.length;
220 avg.docNamesExtractTime /= stats.length;
221 avg.numGoodPoints /= m;
225 avg.maxGoodPoints /= m;
226 for (int j=1; j<avg.pAt.length; j++) {
229 avg.pReleventSum /= m; // this is actually avgp now
230 avg.pReleventSum *= avg.maxGoodPoints; // so that getAvp() would be correct
236 * Returns the time it took to extract doc names for judging the measured query, in milliseconds.
238 public long getDocNamesExtractTime() {
239 return docNamesExtractTime;
243 * Returns the maximal number of good points.
244 * This is the number of relevant docs known by the judge for the measured query.
246 public double getMaxGoodPoints() {
247 return maxGoodPoints;
251 * Returns the number of good points (only relevant points).
253 public double getNumGoodPoints() {
254 return numGoodPoints;
258 * Returns the number of points (both relevant and irrelevant points).
260 public double getNumPoints() {
265 * Returns the recallPoints.
267 public RecallPoint [] getRecallPoints() {
268 return recallPoints.toArray(new RecallPoint[0]);
272 * Returns the mean reciprocal rank (MRR) over the queries, or the reciprocal rank (RR) for a single query.
274 * Reciprocal rank is defined as <code>1/r</code> where <code>r</code> is the
275 * rank of the first correct result, or <code>0</code> if there are no correct
276 * results within the top 5 results.
278 * This follows the definition in
279 * <a href="http://www.cnlp.org/publications/02cnlptrec10.pdf">
280 * Question Answering - CNLP at the TREC-10 Question Answering Track</a>.
282 public double getMRR() {
288 * Returns the search time in milliseconds for the measured query.
290 public long getSearchTime() {