1 package org.apache.lucene.benchmark.byTask.feeds;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.Properties;

import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

/**
 * A {@link ContentSource} reading one line at a time as a
 * {@link org.apache.lucene.document.Document} from a single file. This saves IO
 * cost (over DirContentSource) of recursing through a directory and opening a
 * new file for every document.<br>
 * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
 * <i>title, date, body</i>. If a line is read in a different format, a
 * {@link RuntimeException} will be thrown. In general, you should use this
 * content source for files that were created with {@link WriteLineDocTask}.<br>
 * <br>
 * Config properties:
 * <ul>
 * <li>docs.file=&lt;path to the file&gt;
 * <li>content.source.encoding - default to UTF-8.
 * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
 *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
 * </ul>
 */
51 public class LineDocSource extends ContentSource {
53 /** Reader of a single input line into {@link DocData}. */
54 public static abstract class LineParser {
55 protected final String[] header;
56 /** Construct with the header
57 * @param header header line found in the input file, or null if none
59 public LineParser(String[] header) {
62 /** parse an input line and fill doc data appropriately */
63 public abstract void parseLine(DocData docData, String line);
67 * {@link LineParser} which ignores the header passed to its constructor
68 * and assumes simply that field names and their order are the same
69 * as in {@link WriteLineDocTask#DEFAULT_FIELDS}
71 public static class SimpleLineParser extends LineParser {
72 public SimpleLineParser(String[] header) {
75 public void parseLine(DocData docData, String line) {
77 int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
79 throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
81 docData.setTitle(line.substring(k1,k2));
83 k2 = line.indexOf(WriteLineDocTask.SEP, k1);
85 throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
87 docData.setDate(line.substring(k1,k2));
89 k2 = line.indexOf(WriteLineDocTask.SEP, k1);
91 throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
94 docData.setBody(line.substring(k1));
99 * {@link LineParser} which sets field names and order by
100 * the header - any header - of the lines file.
101 * It is less efficient than {@link SimpleLineParser} but more powerful.
103 public static class HeaderLineParser extends LineParser {
104 private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
105 private final FieldName[] posToF;
106 public HeaderLineParser(String[] header) {
108 posToF = new FieldName[header.length];
109 for (int i=0; i<header.length; i++) {
110 String f = header[i];
111 if (DocMaker.NAME_FIELD.equals(f)) {
112 posToF[i] = FieldName.NAME;
113 } else if (DocMaker.TITLE_FIELD.equals(f)) {
114 posToF[i] = FieldName.TITLE;
115 } else if (DocMaker.DATE_FIELD.equals(f)) {
116 posToF[i] = FieldName.DATE;
117 } else if (DocMaker.BODY_FIELD.equals(f)) {
118 posToF[i] = FieldName.BODY;
120 posToF[i] = FieldName.PROP;
125 public void parseLine(DocData docData, String line) {
129 while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
130 if (n>=header.length) {
131 throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
133 setDocDataField(docData, n, line.substring(k1,k2));
137 if (n!=header.length-1) {
138 throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
141 setDocDataField(docData, n, line.substring(k1));
144 private void setDocDataField(DocData docData, int position, String text) {
145 switch(posToF[position]) {
147 docData.setName(text);
150 docData.setTitle(text);
153 docData.setDate(text);
156 docData.setBody(text);
159 Properties p = docData.getProps();
161 p = new Properties();
164 p.setProperty(header[position], text);
171 private BufferedReader reader;
172 private int readCount;
174 private LineParser docDataLineReader = null;
175 private boolean skipHeaderLine = false;
177 private synchronized void openFile() {
179 if (reader != null) {
182 InputStream is = StreamUtils.inputStream(file);
183 reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
184 if (skipHeaderLine) {
185 reader.readLine(); // skip one line - the header line - already handled that info
187 } catch (IOException e) {
188 throw new RuntimeException(e);
193 public void close() throws IOException {
194 if (reader != null) {
201 public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
206 line = reader.readLine();
209 throw new NoMoreDataException();
213 return getNextDocData(docData);
215 if (docDataLineReader == null) { // first line ever, one time initialization,
216 docDataLineReader = createDocDataLineReader(line);
217 if (skipHeaderLine) {
218 return getNextDocData(docData);
221 // increment IDS only once...
225 // The date String was written in the format of DateTools.dateToString.
228 docDataLineReader.parseLine(docData, line);
232 private LineParser createDocDataLineReader(String line) {
234 String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
236 if (line.startsWith(headIndicator)) {
237 header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
238 skipHeaderLine = true; // mark to skip the header line when input file is reopened
240 header = WriteLineDocTask.DEFAULT_FIELDS;
243 // if a specific DocDataLineReader was configured, must respect it
244 String docDataLineReaderClassName = getConfig().get("line.parser", null);
245 if (docDataLineReaderClassName!=null) {
247 final Class<? extends LineParser> clazz =
248 Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
249 Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
250 return cnstr.newInstance((Object)header);
251 } catch (Exception e) {
252 throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
256 // if this the simple case,
257 if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
258 return new SimpleLineParser(header);
260 return new HeaderLineParser(header);
264 public void resetInputs() throws IOException {
270 public void setConfig(Config config) {
271 super.setConfig(config);
272 String fileName = config.get("docs.file", null);
273 if (fileName == null) {
274 throw new IllegalArgumentException("docs.file must be set");
276 file = new File(fileName).getAbsoluteFile();
277 if (encoding == null) {