1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.lang.reflect.Constructor;
26 import java.util.Arrays;
27 import java.util.Properties;
29 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
30 import org.apache.lucene.benchmark.byTask.utils.Config;
31 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
34 * A {@link ContentSource} reading one line at a time as a
35 * {@link org.apache.lucene.document.Document} from a single file. This saves IO
36 * cost (over DirContentSource) of recursing through a directory and opening a
37 * new file for every document.<br>
38 * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
39 * <i>title, date, body</i>. If a line is read in a different format, a
40 * {@link RuntimeException} will be thrown. In general, you should use this
41 * content source for files that were created with {@link WriteLineDocTask}.<br>
45 * <li>docs.file=&lt;path to the file&gt;
46 * <li>content.source.encoding - default to UTF-8.
47 * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
48 * from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
51 public class LineDocSource extends ContentSource {
53 /** Reader of a single input line into {@link DocData}. */
// Base class of the per-line parsers used by this content source.
// createDocDataLineReader either instantiates a configured subclass through a
// constructor taking String[], or picks one of the two nested implementations.
54 public static abstract class LineParser {
// Header field names handed to the constructor; protected so subclasses can read it.
55 protected final String[] header;
56 /** Construct with the header
57 * @param header header line found in the input file, or null if none
59 public LineParser(String[] header) {
62 /** parse an input line and fill doc data appropriately */
// Implementations receive the raw line text and write the parsed values into docData.
63 public abstract void parseLine(DocData docData, String line);
67 * {@link LineParser} which ignores the header passed to its constructor
68 * and assumes simply that field names and their order are the same
69 * as in {@link WriteLineDocTask#DEFAULT_FIELDS}
// Parser for the fixed column layout title, date, body (see the error messages below).
// NOTE(review): several short lines of this class (guards, updates of k1) are not
// visible in this listing; comments on them are marked as assumptions.
71 public static class SimpleLineParser extends LineParser {
72 public SimpleLineParser(String[] header) {
76 public void parseLine(DocData docData, String line) {
// k2: index of the first separator at or after k1, which ends the title.
78 int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
// NOTE(review): the condition guarding this throw is not shown; judging by the
// message it fires when no separator was found - confirm.
80 throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
// Title is the text between k1 and the separator.
82 docData.setTitle(line.substring(k1,k2));
// Same step for the date. Presumably k1 was advanced past the previous separator
// on a line not shown here - confirm.
84 k2 = line.indexOf(WriteLineDocTask.SEP, k1);
86 throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
88 docData.setDate(line.substring(k1,k2));
// One more lookup: per the message below, a further separator is treated as an
// error (guard condition not visible in this listing).
90 k2 = line.indexOf(WriteLineDocTask.SEP, k1);
92 throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
// Body is everything from k1 to the end of the line.
95 docData.setBody(line.substring(k1));
100 * {@link LineParser} which sets field names and order by
101 * the header - any header - of the lines file.
102 * It is less efficient than {@link SimpleLineParser} but more powerful.
// Parser whose column layout is taken from the header array instead of being fixed.
// NOTE(review): several short lines of this class (else, case labels, break, the
// declarations and updates of n, k1, k2) are not visible in this listing.
104 public static class HeaderLineParser extends LineParser {
// Kind of destination for a column: one of the dedicated DocData setters used in
// setDocDataField, or PROP for an entry in the DocData properties.
105 private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
// posToF[i] is the FieldName assigned to header column i.
106 private final FieldName[] posToF;
107 public HeaderLineParser(String[] header) {
// Classify every header column once, so parsing a line needs no string comparisons.
109 posToF = new FieldName[header.length];
110 for (int i=0; i<header.length; i++) {
111 String f = header[i];
112 if (DocMaker.NAME_FIELD.equals(f)) {
113 posToF[i] = FieldName.NAME;
114 } else if (DocMaker.TITLE_FIELD.equals(f)) {
115 posToF[i] = FieldName.TITLE;
116 } else if (DocMaker.DATE_FIELD.equals(f)) {
117 posToF[i] = FieldName.DATE;
118 } else if (DocMaker.BODY_FIELD.equals(f)) {
119 posToF[i] = FieldName.BODY;
// NOTE(review): presumably the final else branch (the else line itself is not
// visible): a header name matching none of the known fields becomes PROP.
121 posToF[i] = FieldName.PROP;
127 public void parseLine(DocData docData, String line) {
// Walk the line one separator at a time; k2 is the index of the next separator
// at or after k1. n appears to be the index of the current field (its declaration
// and update are on lines not shown here).
131 while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
// A separator was found, so another field follows; reject the line once n has
// already reached the number of header columns.
132 if (n>=header.length) {
133 throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
// Store the text between k1 and the separator as field n.
135 setDocDataField(docData, n, line.substring(k1,k2));
// After the loop only the last field may remain, so n must be the last header index.
139 if (n!=header.length-1) {
140 throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
// The remainder of the line after the last separator is the final field.
143 setDocDataField(docData, n, line.substring(k1));
// Stores text into docData according to the FieldName recorded for this position.
146 private void setDocDataField(DocData docData, int position, String text) {
147 switch(posToF[position]) {
// NOTE(review): the case labels are not visible; presumably NAME, TITLE, DATE, BODY
// and PROP in this order, matching the setter that is called - confirm.
149 docData.setName(text);
152 docData.setTitle(text);
155 docData.setDate(text);
158 docData.setBody(text);
// Generic property: goes into the Properties of docData under the header name of
// this column. A new Properties is created below; the guard around that line is
// not visible (presumably a null check on p) - confirm.
161 Properties p = docData.getProps();
163 p = new Properties();
166 p.setProperty(header[position], text);
// Reader over the input file; assigned in openFile().
173 private BufferedReader reader;
// NOTE(review): presumably the number of documents handed out so far; its use is on
// lines not shown in this listing - confirm.
174 private int readCount;
// Parser for the input lines; created from the first line read in getNextDocData.
176 private LineParser docDataLineReader = null;
// Set to true by createDocDataLineReader when the first line is a header line;
// openFile() then skips one line right after opening.
177 private boolean skipHeaderLine = false;
// (Re)opens the input file and replaces the current reader.
179 private synchronized void openFile() {
// A reader from an earlier pass exists; what is done with it is on a line not
// shown in this listing (presumably it is closed - confirm).
181 if (reader != null) {
// StreamUtils.inputStream supplies the raw stream for file; it is decoded with the
// encoding field and buffered with StreamUtils.BUFFER_SIZE.
184 InputStream is = StreamUtils.inputStream(file);
185 reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
186 if (skipHeaderLine) {
187 reader.readLine(); // skip one line - the header line - already handled that info
// I/O failures are rethrown unchecked, keeping the original exception as the cause.
189 } catch (IOException e) {
190 throw new RuntimeException(e);
// Acts only when a reader exists. NOTE(review): the body of this branch is not
// visible in this listing (presumably it closes the reader) - confirm.
195 public void close() throws IOException {
196 if (reader != null) {
// Reads the next line of the input and fills docData from it.
// NOTE(review): many short lines of this method are not visible in this listing;
// comments on them are marked as assumptions.
203 public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
// Pull the next raw line from the currently open reader.
208 line = reader.readLine();
// NOTE(review): the conditions around this throw are not shown; presumably it is
// reached when readLine returned null at the end of the input - confirm.
211 throw new NoMoreDataException();
// Retry through a recursive call (the statements preceding it are not shown here).
215 return getNextDocData(docData);
217 if (docDataLineReader == null) { // first line ever, one time initialization,
// Choose the parser from this first line; the call may also set skipHeaderLine.
218 docDataLineReader = createDocDataLineReader(line);
// The first line was a header rather than a document, so fetch the following line.
219 if (skipHeaderLine) {
220 return getNextDocData(docData);
223 // increment IDS only once...
227 // The date String was written in the format of DateTools.dateToString.
// Field extraction is delegated to the selected parser.
230 docDataLineReader.parseLine(docData, line);
// Picks the LineParser for this input from its first line and the line.parser setting.
234 private LineParser createDocDataLineReader(String line) {
// A header line starts with the header indicator immediately followed by the separator.
236 String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
238 if (line.startsWith(headIndicator)) {
// Field names are the separator-delimited tokens after the indicator prefix.
// String.split takes a regular expression, so this relies on SEP not being a
// regex metacharacter - TODO confirm against WriteLineDocTask.SEP.
239 header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
240 skipHeaderLine = true; // mark to skip the header line when input file is reopened
// NOTE(review): presumably the else branch (first line is not a header); the else
// line itself is not visible in this listing.
242 header = WriteLineDocTask.DEFAULT_FIELDS;
245 // if a specific DocDataLineReader was configured, must respect it
246 String docDataLineReaderClassName = getConfig().get("line.parser", null);
247 if (docDataLineReaderClassName!=null) {
// Load the named class, require it to extend LineParser, and look up its
// constructor taking a single String[] argument.
249 final Class<? extends LineParser> clazz =
250 Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
251 Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
// The cast to Object passes header as one argument instead of spreading it as varargs.
252 return cnstr.newInstance((Object)header);
// Any failure to load or construct the parser is reported with the class name and cause.
253 } catch (Exception e) {
254 throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
258 // if this is the simple case (header equals the default fields), use SimpleLineParser
259 if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
260 return new SimpleLineParser(header);
// Any other header needs the header-driven parser.
262 return new HeaderLineParser(header);
// NOTE(review): the body of this method is not visible in this listing; presumably it
// delegates to the superclass and re-opens the input through openFile() - confirm.
266 public void resetInputs() throws IOException {
272 public void setConfig(Config config) {
273 super.setConfig(config);
274 String fileName = config.get("docs.file", null);
275 if (fileName == null) {
276 throw new IllegalArgumentException("docs.file must be set");
278 file = new File(fileName).getAbsoluteFile();
279 if (encoding == null) {