2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.lucene.benchmark.quality.trec;
19 import java.io.BufferedReader;
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.HashMap;
25 import org.apache.lucene.benchmark.quality.QualityQuery;
30 * Expects the following topic format:
33 * <num> Number: nnn
35 * <title> title of the topic
37 * <desc> Description:
38 * description of the topic
40 * <narr> Narrative:
41 * "story" composed by assessors.
45 * Comment lines starting with '#' are ignored.
47 public class TrecTopicsReader {
49 private static final String newline = System.getProperty("line.separator");
52 * Constructor for the TREC topics reader.
54 public TrecTopicsReader() {
59 * Read quality queries from trec format topics file.
60 * @param reader where queries are read from.
61 * @return the result quality queries.
62 * @throws IOException if the queries cannot be read.
64 public QualityQuery[] readQueries(BufferedReader reader) throws IOException {
65 ArrayList<QualityQuery> res = new ArrayList<QualityQuery>();
68 while (null!=(sb=read(reader,"<top>",null,false,false))) {
69 HashMap<String,String> fields = new HashMap<String,String>();
71 sb = read(reader,"<num>",null,true,false);
72 int k = sb.indexOf(":");
73 String id = sb.substring(k+1).trim();
75 sb = read(reader,"<title>",null,true,false);
77 String title = sb.substring(k+1).trim();
79 read(reader,"<desc>",null,false,false);
82 while ((line = reader.readLine()) != null) {
83 if (line.startsWith("<narr>"))
85 if (sb.length() > 0) sb.append(' ');
88 String description = sb.toString().trim();
91 while ((line = reader.readLine()) != null) {
92 if (line.startsWith("</top>"))
94 if (sb.length() > 0) sb.append(' ');
97 String narrative = sb.toString().trim();
99 fields.put("title",title);
100 fields.put("description",description);
101 fields.put("narrative", narrative);
102 QualityQuery topic = new QualityQuery(id,fields);
108 // sort result array (by ID)
109 QualityQuery qq[] = res.toArray(new QualityQuery[0]);
114 // read until finding a line that starts with the specified prefix
115 private StringBuilder read (BufferedReader reader, String prefix, StringBuilder sb, boolean collectMatchLine, boolean collectAll) throws IOException {
116 sb = (sb==null ? new StringBuilder() : sb);
119 String line = reader.readLine();
123 if (line.startsWith(prefix)) {
124 if (collectMatchLine) {
135 //System.out.println("read: "+sb);