pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / benchmark / src / java / org / apache / lucene / benchmark / quality / trec / TrecJudge.java
diff --git a/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java
new file mode 100644 (file)
index 0000000..540c7f1
--- /dev/null
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.quality.trec;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.benchmark.quality.Judge;
+import org.apache.lucene.benchmark.quality.QualityQuery;
+
+/**
+ * Judge if given document is relevant to given quality query, based on Trec format for judgements.
+ */
+public class TrecJudge implements Judge {
+
+  HashMap<String,QRelJudgement> judgements;
+  
+  /**
+   * Constructor from a reader.
+   * <p>
+   * Expected input format:
+   * <pre>
+   *     qnum  0   doc-name     is-relevant
+   * </pre> 
+   * Two sample lines:
+   * <pre> 
+   *     19    0   doc303       1
+   *     19    0   doc7295      0
+   * </pre> 
+   * @param reader where judgments are read from.
+   * @throws IOException 
+   */
+  public TrecJudge (BufferedReader reader) throws IOException {
+    judgements = new HashMap<String,QRelJudgement>();
+    QRelJudgement curr = null;
+    String zero = "0";
+    String line;
+    
+    try {
+      while (null!=(line=reader.readLine())) {
+        line = line.trim();
+        if (line.length()==0 || '#'==line.charAt(0)) {
+          continue;
+        }
+        StringTokenizer st = new StringTokenizer(line);
+        String queryID = st.nextToken();
+        st.nextToken();
+        String docName = st.nextToken();
+        boolean relevant = !zero.equals(st.nextToken());
+        assert !st.hasMoreTokens() : "wrong format: "+line+"  next: "+st.nextToken();
+        if (relevant) { // only keep relevant docs
+          if (curr==null || !curr.queryID.equals(queryID)) {
+            curr = judgements.get(queryID);
+            if (curr==null) {
+              curr = new QRelJudgement(queryID);
+              judgements.put(queryID,curr);
+            }
+          }
+          curr.addRelevandDoc(docName);
+        }
+      }
+    } finally {
+      reader.close();
+    }
+  }
+  
+  // inherit javadocs
+  public boolean isRelevant(String docName, QualityQuery query) {
+    QRelJudgement qrj = judgements.get(query.getQueryID());
+    return qrj!=null && qrj.isRelevant(docName);
+  }
+
+  /** single Judgement of a trec quality query */
+  private static class QRelJudgement {
+    String queryID;
+    private HashMap<String,String> relevantDocs;
+    
+    QRelJudgement(String queryID) {
+      this.queryID = queryID;
+      relevantDocs = new HashMap<String,String>();
+    }
+    
+    public void addRelevandDoc(String docName) {
+      relevantDocs.put(docName,docName);
+    }
+
+    boolean isRelevant(String docName) {
+      return relevantDocs.containsKey(docName);
+    }
+
+    public int maxRecall() {
+      return relevantDocs.size();
+    }
+  }
+
+  // inherit javadocs
+  public boolean validateData(QualityQuery[] qq, PrintWriter logger) {
+    HashMap<String,QRelJudgement> missingQueries = new HashMap<String, QRelJudgement>(judgements);
+    ArrayList<String> missingJudgements = new ArrayList<String>();
+    for (int i=0; i<qq.length; i++) {
+      String id = qq[i].getQueryID();
+      if (missingQueries.containsKey(id)) {
+        missingQueries.remove(id);
+      } else {
+        missingJudgements.add(id);
+      }
+    }
+    boolean isValid = true;
+    if (missingJudgements.size()>0) {
+      isValid = false;
+      if (logger!=null) {
+        logger.println("WARNING: "+missingJudgements.size()+" queries have no judgments! - ");
+        for (int i=0; i<missingJudgements.size(); i++) {
+          logger.println("   "+ missingJudgements.get(i));
+        }
+      }
+    }
+    if (missingQueries.size()>0) {
+      isValid = false;
+      if (logger!=null) {
+        logger.println("WARNING: "+missingQueries.size()+" judgments match no query! - ");
+        for (final String id : missingQueries.keySet()) {
+          logger.println("   "+id);
+        }
+      }
+    }
+    return isValid;
+  }
+
+  // inherit javadocs
+  public int maxRecall(QualityQuery query) {
+    QRelJudgement qrj = judgements.get(query.getQueryID());
+    if (qrj!=null) {
+      return qrj.maxRecall();
+    }
+    return 0;
+  }
+}