lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java

   1 package org.apache.lucene.benchmark.byTask.feeds;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.BufferedReader;
  21 import java.io.File;
  22 import java.io.IOException;
  23 import java.io.InputStream;
  24 import java.io.InputStreamReader;
  25 import java.lang.reflect.Constructor;
  26 import java.util.Arrays;
  27 import java.util.Properties;
  28
  29 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
  30 import org.apache.lucene.benchmark.byTask.utils.Config;
  31 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
  32
  33 /**
  34  * A {@link ContentSource} reading one line at a time as a
  35  * {@link org.apache.lucene.document.Document} from a single file. This saves IO
  36  * cost (over DirContentSource) of recursing through a directory and opening a
  37  * new file for every document.<br>
  38  * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
  39  * <i>title, date, body</i>. If a line is read in a different format, a
  40  * {@link RuntimeException} will be thrown. In general, you should use this
  41  * content source for files that were created with {@link WriteLineDocTask}.<br>
  42  * <br>
  43  * Config properties:
  44  * <ul>
  45  * <li>docs.file=&lt;path to the file&gt;
  46  * <li>content.source.encoding - default to UTF-8.
  47  * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
  48  *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
  49  * </ul>
  50  */
  51 public class LineDocSource extends ContentSource {
  52
  53   /** Reader of a single input line into {@link DocData}. */
  54   public static abstract class LineParser {
  55     protected final String[] header;
  56     /** Construct with the header
  57      * @param header header line found in the input file, or null if none
  58      */
  59     public LineParser(String[] header) {
  60       this.header = header;
  61     }
  62     /** parse an input line and fill doc data appropriately */
  63     public abstract void parseLine(DocData docData, String line);
  64   }
  65
  66   /**
  67    * {@link LineParser} which ignores the header passed to its constructor
  68    * and assumes simply that field names and their order are the same
  69    * as in {@link WriteLineDocTask#DEFAULT_FIELDS}
  70    */
  71   public static class SimpleLineParser extends LineParser {
  72     public SimpleLineParser(String[] header) {
  73       super(header);
  74     }
  75     @Override
  76     public void parseLine(DocData docData, String line) {
  77       int k1 = 0;
  78       int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
  79       if (k2<0) {
  80         throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
  81       }
  82       docData.setTitle(line.substring(k1,k2));
  83       k1 = k2+1;
  84       k2 = line.indexOf(WriteLineDocTask.SEP, k1);
  85       if (k2<0) {
  86         throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
  87       }
  88       docData.setDate(line.substring(k1,k2));
  89       k1 = k2+1;
  90       k2 = line.indexOf(WriteLineDocTask.SEP, k1);
  91       if (k2>=0) {
  92         throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
  93       }
  94       // last one
  95       docData.setBody(line.substring(k1));
  96     }
  97   }
  98
  99   /**
 100    * {@link LineParser} which sets field names and order by
 101    * the header - any header - of the lines file.
 102    * It is less efficient than {@link SimpleLineParser} but more powerful.
 103    */
 104   public static class HeaderLineParser extends LineParser {
 105     private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
 106     private final FieldName[] posToF;
 107     public HeaderLineParser(String[] header) {
 108       super(header);
 109       posToF = new FieldName[header.length];
 110       for (int i=0; i<header.length; i++) {
 111         String f = header[i];
 112         if (DocMaker.NAME_FIELD.equals(f)) {
 113           posToF[i] = FieldName.NAME;
 114         } else if (DocMaker.TITLE_FIELD.equals(f)) {
 115           posToF[i] = FieldName.TITLE;
 116         } else if (DocMaker.DATE_FIELD.equals(f)) {
 117           posToF[i] = FieldName.DATE;
 118         } else if (DocMaker.BODY_FIELD.equals(f)) {
 119           posToF[i] = FieldName.BODY;
 120         } else {
 121           posToF[i] = FieldName.PROP;
 122         }
 123       }
 124     }
 125
 126     @Override
 127     public void parseLine(DocData docData, String line) {
 128       int n = 0;
 129       int k1 = 0;
 130       int k2;
 131       while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
 132         if (n>=header.length) {
 133           throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
 134         }
 135         setDocDataField(docData, n, line.substring(k1,k2));
 136         ++n;
 137         k1 = k2 + 1;
 138       }
 139       if (n!=header.length-1) {
 140         throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
 141       }
 142       // last one
 143       setDocDataField(docData, n, line.substring(k1));
 144     }
 145
 146     private void setDocDataField(DocData docData, int position, String text) {
 147       switch(posToF[position]) {
 148         case NAME:
 149           docData.setName(text);
 150           break;
 151         case TITLE:
 152           docData.setTitle(text);
 153           break;
 154         case DATE:
 155           docData.setDate(text);
 156           break;
 157         case BODY:
 158           docData.setBody(text);
 159           break;
 160         case PROP:
 161           Properties p = docData.getProps();
 162           if (p==null) {
 163             p = new Properties();
 164             docData.setProps(p);
 165           }
 166           p.setProperty(header[position], text);
 167           break;
 168       }
 169     }
 170   }
 171
 172   private File file;
 173   private BufferedReader reader;
 174   private int readCount;
 175
 176   private LineParser docDataLineReader = null;
 177   private boolean skipHeaderLine = false;
 178
 179   private synchronized void openFile() {
 180     try {
 181       if (reader != null) {
 182         reader.close();
 183       }
 184       InputStream is = StreamUtils.inputStream(file);
 185       reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
 186       if (skipHeaderLine) {
 187         reader.readLine(); // skip one line - the header line - already handled that info
 188       }
 189     } catch (IOException e) {
 190       throw new RuntimeException(e);
 191     }
 192   }
 193
 194   @Override
 195   public void close() throws IOException {
 196     if (reader != null) {
 197       reader.close();
 198       reader = null;
 199     }
 200   }
 201
 202   @Override
 203   public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
 204     final String line;
 205     final int myID;
 206
 207     synchronized(this) {
 208       line = reader.readLine();
 209       if (line == null) {
 210         if (!forever) {
 211           throw new NoMoreDataException();
 212         }
 213         // Reset the file
 214         openFile();
 215         return getNextDocData(docData);
 216       }
 217       if (docDataLineReader == null) { // first line ever, one time initialization,
 218         docDataLineReader = createDocDataLineReader(line);
 219         if (skipHeaderLine) {
 220           return getNextDocData(docData);
 221         }
 222       }
 223       // increment IDS only once...
 224       myID = readCount++;
 225     }
 226
 227     // The date String was written in the format of DateTools.dateToString.
 228     docData.clear();
 229     docData.setID(myID);
 230     docDataLineReader.parseLine(docData, line);
 231     return docData;
 232   }
 233
 234   private LineParser createDocDataLineReader(String line) {
 235     String[] header;
 236     String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
 237
 238     if (line.startsWith(headIndicator)) {
 239       header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
 240       skipHeaderLine = true; // mark to skip the header line when input file is reopened
 241     } else {
 242       header = WriteLineDocTask.DEFAULT_FIELDS;
 243     }
 244
 245     // if a specific DocDataLineReader was configured, must respect it
 246     String docDataLineReaderClassName = getConfig().get("line.parser", null);
 247     if (docDataLineReaderClassName!=null) {
 248       try {
 249         final Class<? extends LineParser> clazz =
 250           Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
 251         Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
 252         return cnstr.newInstance((Object)header);
 253       } catch (Exception e) {
 254         throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
 255       }
 256     }
 257
 258     // if this the simple case,
 259     if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
 260       return new SimpleLineParser(header);
 261     }
 262     return new HeaderLineParser(header);
 263   }
 264
 265   @Override
 266   public void resetInputs() throws IOException {
 267     super.resetInputs();
 268     openFile();
 269   }
 270
 271   @Override
 272   public void setConfig(Config config) {
 273     super.setConfig(config);
 274     String fileName = config.get("docs.file", null);
 275     if (fileName == null) {
 276       throw new IllegalArgumentException("docs.file must be set");
 277     }
 278     file = new File(fileName).getAbsoluteFile();
 279     if (encoding == null) {
 280       encoding = "UTF-8";
 281     }
 282   }
 283
 284 }