lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java

   1 package org.apache.lucene.benchmark.byTask.feeds;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.BufferedReader;
  21 import java.io.File;
  22 import java.io.IOException;
  23 import java.io.InputStream;
  24 import java.io.InputStreamReader;
  25 import java.lang.reflect.Constructor;
  26 import java.util.Arrays;
  27 import java.util.Properties;
  28
  29 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
  30 import org.apache.lucene.benchmark.byTask.utils.Config;
  31 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
  32
  33 /**
  34  * A {@link ContentSource} reading one line at a time as a
  35  * {@link org.apache.lucene.document.Document} from a single file. This saves IO
  36  * cost (over DirContentSource) of recursing through a directory and opening a
  37  * new file for every document.<br>
  38  * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
  39  * <i>title, date, body</i>. If a line is read in a different format, a
  40  * {@link RuntimeException} will be thrown. In general, you should use this
  41  * content source for files that were created with {@link WriteLineDocTask}.<br>
  42  * <br>
  43  * Config properties:
  44  * <ul>
  45  * <li>docs.file=&lt;path to the file&gt;
  46  * <li>content.source.encoding - default to UTF-8.
  47  * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
  48  *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
  49  * </ul>
  50  */
  51 public class LineDocSource extends ContentSource {
  52
  53   /** Reader of a single input line into {@link DocData}. */
  54   public static abstract class LineParser {
  55     protected final String[] header;
  56     /** Construct with the header
  57      * @param header header line found in the input file, or null if none
  58      */
  59     public LineParser(String[] header) {
  60       this.header = header;
  61     }
  62     /** parse an input line and fill doc data appropriately */
  63     public abstract void parseLine(DocData docData, String line);
  64   }
  65
  66   /**
  67    * {@link LineParser} which ignores the header passed to its constructor
  68    * and assumes simply that field names and their order are the same
  69    * as in {@link WriteLineDocTask#DEFAULT_FIELDS}
  70    */
  71   public static class SimpleLineParser extends LineParser {
  72     public SimpleLineParser(String[] header) {
  73       super(header);
  74     }
  75     public void parseLine(DocData docData, String line) {
  76       int k1 = 0;
  77       int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
  78       if (k2<0) {
  79         throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
  80       }
  81       docData.setTitle(line.substring(k1,k2));
  82       k1 = k2+1;
  83       k2 = line.indexOf(WriteLineDocTask.SEP, k1);
  84       if (k2<0) {
  85         throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
  86       }
  87       docData.setDate(line.substring(k1,k2));
  88       k1 = k2+1;
  89       k2 = line.indexOf(WriteLineDocTask.SEP, k1);
  90       if (k2>=0) {
  91         throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
  92       }
  93       // last one
  94       docData.setBody(line.substring(k1));
  95     }
  96   }
  97
  98   /**
  99    * {@link LineParser} which sets field names and order by
 100    * the header - any header - of the lines file.
 101    * It is less efficient than {@link SimpleLineParser} but more powerful.
 102    */
 103   public static class HeaderLineParser extends LineParser {
 104     private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
 105     private final FieldName[] posToF;
 106     public HeaderLineParser(String[] header) {
 107       super(header);
 108       posToF = new FieldName[header.length];
 109       for (int i=0; i<header.length; i++) {
 110         String f = header[i];
 111         if (DocMaker.NAME_FIELD.equals(f)) {
 112           posToF[i] = FieldName.NAME;
 113         } else if (DocMaker.TITLE_FIELD.equals(f)) {
 114           posToF[i] = FieldName.TITLE;
 115         } else if (DocMaker.DATE_FIELD.equals(f)) {
 116           posToF[i] = FieldName.DATE;
 117         } else if (DocMaker.BODY_FIELD.equals(f)) {
 118           posToF[i] = FieldName.BODY;
 119         } else {
 120           posToF[i] = FieldName.PROP;
 121         }
 122       }
 123     }
 124
 125     public void parseLine(DocData docData, String line) {
 126       int n = 0;
 127       int k1 = 0;
 128       int k2;
 129       while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
 130         if (n>=header.length) {
 131           throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
 132         }
 133         setDocDataField(docData, n, line.substring(k1,k2));
 134         ++n;
 135         k1 = k2 + 1;
 136       }
 137       if (n!=header.length-1) {
 138         throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
 139       }
 140       // last one
 141       setDocDataField(docData, n, line.substring(k1));
 142     }
 143
 144     private void setDocDataField(DocData docData, int position, String text) {
 145       switch(posToF[position]) {
 146         case NAME:
 147           docData.setName(text);
 148           break;
 149         case TITLE:
 150           docData.setTitle(text);
 151           break;
 152         case DATE:
 153           docData.setDate(text);
 154           break;
 155         case BODY:
 156           docData.setBody(text);
 157           break;
 158         case PROP:
 159           Properties p = docData.getProps();
 160           if (p==null) {
 161             p = new Properties();
 162             docData.setProps(p);
 163           }
 164           p.setProperty(header[position], text);
 165           break;
 166       }
 167     }
 168   }
 169
 170   private File file;
 171   private BufferedReader reader;
 172   private int readCount;
 173
 174   private LineParser docDataLineReader = null;
 175   private boolean skipHeaderLine = false;
 176
 177   private synchronized void openFile() {
 178     try {
 179       if (reader != null) {
 180         reader.close();
 181       }
 182       InputStream is = StreamUtils.inputStream(file);
 183       reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
 184       if (skipHeaderLine) {
 185         reader.readLine(); // skip one line - the header line - already handled that info
 186       }
 187     } catch (IOException e) {
 188       throw new RuntimeException(e);
 189     }
 190   }
 191
 192   @Override
 193   public void close() throws IOException {
 194     if (reader != null) {
 195       reader.close();
 196       reader = null;
 197     }
 198   }
 199
 200   @Override
 201   public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
 202     final String line;
 203     final int myID;
 204
 205     synchronized(this) {
 206       line = reader.readLine();
 207       if (line == null) {
 208         if (!forever) {
 209           throw new NoMoreDataException();
 210         }
 211         // Reset the file
 212         openFile();
 213         return getNextDocData(docData);
 214       }
 215       if (docDataLineReader == null) { // first line ever, one time initialization,
 216         docDataLineReader = createDocDataLineReader(line);
 217         if (skipHeaderLine) {
 218           return getNextDocData(docData);
 219         }
 220       }
 221       // increment IDS only once...
 222       myID = readCount++;
 223     }
 224
 225     // The date String was written in the format of DateTools.dateToString.
 226     docData.clear();
 227     docData.setID(myID);
 228     docDataLineReader.parseLine(docData, line);
 229     return docData;
 230   }
 231
 232   private LineParser createDocDataLineReader(String line) {
 233     String[] header;
 234     String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
 235
 236     if (line.startsWith(headIndicator)) {
 237       header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
 238       skipHeaderLine = true; // mark to skip the header line when input file is reopened
 239     } else {
 240       header = WriteLineDocTask.DEFAULT_FIELDS;
 241     }
 242
 243     // if a specific DocDataLineReader was configured, must respect it
 244     String docDataLineReaderClassName = getConfig().get("line.parser", null);
 245     if (docDataLineReaderClassName!=null) {
 246       try {
 247         final Class<? extends LineParser> clazz =
 248           Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
 249         Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
 250         return cnstr.newInstance((Object)header);
 251       } catch (Exception e) {
 252         throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
 253       }
 254     }
 255
 256     // if this the simple case,
 257     if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
 258       return new SimpleLineParser(header);
 259     }
 260     return new HeaderLineParser(header);
 261   }
 262
  /**
   * Resets this source: delegates to the superclass and then (re)opens the
   * input file, so that reading starts over from the top of the file.
   */
  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
    openFile(); // closes the current reader, if any, before opening a new one
  }
 268
 269   @Override
 270   public void setConfig(Config config) {
 271     super.setConfig(config);
 272     String fileName = config.get("docs.file", null);
 273     if (fileName == null) {
 274       throw new IllegalArgumentException("docs.file must be set");
 275     }
 276     file = new File(fileName).getAbsoluteFile();
 277     if (encoding == null) {
 278       encoding = "UTF-8";
 279     }
 280   }
 281
 282 }