pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / benchmark / src / java / org / apache / lucene / benchmark / quality / trec / TrecTopicsReader.java
diff --git a/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecTopicsReader.java
new file mode 100644 (file)
index 0000000..e84aa51
--- /dev/null
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.quality.trec;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import org.apache.lucene.benchmark.quality.QualityQuery;
+
+/**
+ * Read TREC topics.
+ * <p>
+ * Expects this topic format -
+ * <pre>
+ *   &lt;top&gt;
+ *   &lt;num&gt; Number: nnn
+ *     
+ *   &lt;title&gt; title of the topic
+ *     
+ *   &lt;desc&gt; Description:
+ *   description of the topic
+ *     
+ *   &lt;narr&gt; Narrative:
+ *   "story" composed by assessors.
+ *    
+ *   &lt;/top&gt;
+ * </pre>
+ * Comment lines starting with '#' are ignored.
+ */
+public class TrecTopicsReader {
+
+  private static final String newline = System.getProperty("line.separator");
+  
+  /**
+   *  Constructor for Trec's TopicsReader
+   */
+  public TrecTopicsReader() {
+    super();
+  }
+
+  /**
+   * Read quality queries from trec format topics file.
+   * @param reader where queries are read from.
+   * @return the result quality queries.
+   * @throws IOException if cannot read the queries.
+   */
+  public QualityQuery[] readQueries(BufferedReader reader) throws IOException {
+    ArrayList<QualityQuery> res = new ArrayList<QualityQuery>();
+    StringBuilder sb;
+    try {
+      while (null!=(sb=read(reader,"<top>",null,false,false))) {
+        HashMap<String,String> fields = new HashMap<String,String>();
+        // id
+        sb = read(reader,"<num>",null,true,false);
+        int k = sb.indexOf(":");
+        String id = sb.substring(k+1).trim();
+        // title
+        sb = read(reader,"<title>",null,true,false);
+        k = sb.indexOf(">");
+        String title = sb.substring(k+1).trim();
+        // description
+        read(reader,"<desc>",null,false,false);
+        sb.setLength(0);
+        String line = null;
+        while ((line = reader.readLine()) != null) {
+          if (line.startsWith("<narr>"))
+            break;
+          if (sb.length() > 0) sb.append(' ');
+          sb.append(line);
+        }
+        String description = sb.toString().trim();
+        // narrative
+        sb.setLength(0);
+        while ((line = reader.readLine()) != null) {
+          if (line.startsWith("</top>"))
+            break;
+          if (sb.length() > 0) sb.append(' ');
+          sb.append(line);
+        }
+        String narrative = sb.toString().trim();
+        // we got a topic!
+        fields.put("title",title);
+        fields.put("description",description);
+        fields.put("narrative", narrative);
+        QualityQuery topic = new QualityQuery(id,fields);
+        res.add(topic);
+      }
+    } finally {
+      reader.close();
+    }
+    // sort result array (by ID) 
+    QualityQuery qq[] = res.toArray(new QualityQuery[0]);
+    Arrays.sort(qq);
+    return qq;
+  }
+
+  // read until finding a line that starts with the specified prefix
+  private StringBuilder read (BufferedReader reader, String prefix, StringBuilder sb, boolean collectMatchLine, boolean collectAll) throws IOException {
+    sb = (sb==null ? new StringBuilder() : sb);
+    String sep = "";
+    while (true) {
+      String line = reader.readLine();
+      if (line==null) {
+        return null;
+      }
+      if (line.startsWith(prefix)) {
+        if (collectMatchLine) {
+          sb.append(sep+line);
+          sep = newline;
+        }
+        break;
+      }
+      if (collectAll) {
+        sb.append(sep+line);
+        sep = newline;
+      }
+    }
+    //System.out.println("read: "+sb);
+    return sb;
+  }
+}