1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.UnsupportedEncodingException;
22 import java.util.HashMap;
23 import java.util.Calendar;
25 import java.util.Properties;
26 import java.util.Locale;
27 import java.util.Random;
28 import java.util.Date;
29 import java.util.concurrent.atomic.AtomicInteger;
30 import java.text.SimpleDateFormat;
31 import java.text.ParsePosition;
33 import org.apache.lucene.benchmark.byTask.utils.Config;
34 import org.apache.lucene.benchmark.byTask.utils.Format;
35 import org.apache.lucene.document.Document;
36 import org.apache.lucene.document.Field;
37 import org.apache.lucene.document.NumericField;
38 import org.apache.lucene.document.Field.Index;
39 import org.apache.lucene.document.Field.Store;
40 import org.apache.lucene.document.Field.TermVector;
43 * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
44 * {@link DocData} objects. Supports the following parameters:
46 * <li><b>content.source</b> - specifies the {@link ContentSource} class to use
47 * (default <b>SingleDocSource</b>).
48 * <li><b>doc.stored</b> - specifies whether fields should be stored (default
50 * <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default
51 * = <b>doc.stored</b>).
52 * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
53 * (default <b>true</b>).
54 * <li><b>doc.body.tokenized</b> - specifies whether the
55 * body field should be tokenized (default = <b>doc.tokenized</b>).
56 * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
57 * the index or not. (default <b>false</b>).
58 * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
59 * stored in the index for the body field. This can be set to true, while
60 * <code>doc.tokenized.norms</code> is set to false, to allow norms storing just
61 * for the body field. (default <b>true</b>).
62 * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
63 * for fields (default <b>false</b>).
64 * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
65 * be stored with positions (default <b>false</b>).
66 * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
67 * stored with offsets (default <b>false</b>).
68 * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
69 * the document's content in the document (default <b>false</b>).
70 * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
71 * should be reused (default <b>true</b>).
 * <li><b>doc.index.props</b> - specifies whether the properties returned by
 * {@link DocData#getProps()} will be indexed. (default <b>false</b>).
 * <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
 * IDs from 0 to this limit. This is useful with UpdateDoc
 * for testing performance of IndexWriter.updateDocument.
79 public class DocMaker {
  /**
   * Per-thread remainder of a source document whose body was only partially
   * consumed by {@link #makeDocument(int)}; the unconsumed body seeds the next
   * document created on the same thread.
   * NOTE(review): makeDocument(int) also reads a {@code cnt} field from this
   * class; its declaration (and the closing brace) is not visible in this view.
   */
  private static class LeftOver {
    private DocData docdata; // remaining, not-yet-indexed document data
87 private int updateDocIDLimit;
  /**
   * Per-thread cache of {@link Document}/{@link Field}/{@link NumericField}
   * instances, used when <code>doc.reuse.fields</code> is true to avoid
   * re-allocating field objects for every created document.
   */
  static class DocState {

    private final Map<String,Field> fields;               // cached text fields, keyed by field name
    private final Map<String,NumericField> numericFields; // cached numeric fields, keyed by field name
    private final boolean reuseFields;                    // when false, getters always allocate fresh instances
    DocData docData = new DocData();                      // reusable carrier filled by the ContentSource

    public DocState(boolean reuseFields, Store store, Store bodyStore, Index index, Index bodyIndex, TermVector termVector) {
      this.reuseFields = reuseFields;
      // NOTE(review): the reuse/non-reuse branch structure (and the
      // declaration of the 'doc' field assigned below) is not visible in this
      // view; comments describe only the visible lines.
      fields = new HashMap<String,Field>();
      numericFields = new HashMap<String,NumericField>();

      // Initialize the map with the default fields.
      fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
      fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
      fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
      // The id field is always stored and indexed verbatim, regardless of the
      // configured store/index settings.
      fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));

      numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
      numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));

      doc = new Document();
      // Non-reuse path: no field caches are kept.
      numericFields = null;

    /**
     * Returns a field corresponding to the field name. If
     * <code>reuseFields</code> was set to true, then it attempts to reuse a
     * Field instance. If such a field does not exist, it creates a new one.
     * NOTE(review): the guarding if/else, the cache insert, and the return of
     * the cached instance are not visible in this view.
     */
    Field getField(String name, Store store, Index index, TermVector termVector) {
      // Non-reuse path: always allocate a fresh field.
      return new Field(name, "", store, index, termVector);
      // Reuse path: look up the cache, creating (and caching) on a miss.
      Field f = fields.get(name);
      f = new Field(name, "", store, index, termVector);

    /** Same as {@link #getField}, but for {@link NumericField} instances. */
    NumericField getNumericField(String name) {
      // Non-reuse path: always allocate.
      return new NumericField(name);
      // Reuse path: cache lookup with create-on-miss.
      NumericField f = numericFields.get(name);
      f = new NumericField(name);
      numericFields.put(name, f);
155 private boolean storeBytes = false;
  /**
   * Per-thread date-parsing state. SimpleDateFormat and Calendar are not
   * thread-safe, so each worker thread gets its own instance via the
   * {@code dateParsers} ThreadLocal.
   */
  private static class DateUtil {
    // Parses dates of the form "01-Jan-2010 12:34:56".
    public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
    public Calendar cal = Calendar.getInstance();
    // Reusable parse position, reset by the caller before each parse.
    public ParsePosition pos = new ParsePosition(0);
    // NOTE(review): this statement belongs inside a constructor/initializer
    // block whose braces are not visible in this view.
    parser.setLenient(true);
  // leftovers are thread local, because it is unsafe to share residues between threads
  private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
  // Per-thread field/document cache; replaced wholesale in setConfig() to
  // invalidate every thread's cache between rounds.
  private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();
  // Per-thread date parser (SimpleDateFormat is not thread-safe).
  private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<DateUtil>();
  // Well-known names of the fields this maker adds to each document.
  public static final String BODY_FIELD = "body";
  public static final String TITLE_FIELD = "doctitle";
  public static final String DATE_FIELD = "docdate";
  public static final String DATE_MSEC_FIELD = "docdatenum";   // doc date as epoch millis (numeric field)
  public static final String TIME_SEC_FIELD = "doctimesecnum"; // time-of-day in seconds (numeric field)
  public static final String ID_FIELD = "docid";
  public static final String BYTES_FIELD = "bytes";            // raw UTF-8 body bytes, when doc.store.body.bytes=true
  public static final String NAME_FIELD = "docname";
  protected Config config; // benchmark configuration, set in setConfig()

  // Store/index/term-vector settings derived from config in setConfig().
  // The body field carries its own store/index settings, independent of the
  // other fields.
  protected Store storeVal = Store.NO;
  protected Store bodyStoreVal = Store.NO;
  protected Index indexVal = Index.ANALYZED_NO_NORMS;
  protected Index bodyIndexVal = Index.ANALYZED;
  protected TermVector termVecVal = TermVector.NO;

  protected ContentSource source;    // produces the raw DocData this maker turns into Documents
  protected boolean reuseFields;     // reuse Document/Field instances ("doc.reuse.fields")
  protected boolean indexProperties; // index DocData properties as extra fields ("doc.index.props")

  // High-water marks so printDocStatistics() only reports growth since the
  // previous printout.
  private int lastPrintedNumUniqueTexts = 0;
  private long lastPrintedNumUniqueBytes = 0;

  private final AtomicInteger numDocsCreated = new AtomicInteger(); // also serves as the default doc id
  private int printNum = 0; // ordinal of the next statistics printout
  // Creates a Document from the given DocData.
  // use only part of the body, modify it to keep the rest (or use all if size==0).
  // reset the docdata properties so they are not added more than once.
  //
  // NOTE(review): several control-flow lines (if/else branches, closing
  // braces, and the declarations of 'id', 'r', 'date' and 'bdy') are not
  // visible in this view; comments below describe only the visible lines.
  private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {

    final DocState ds = getDocState();
    // Reuse the cached Document when configured; otherwise allocate a new one,
    // then drop any fields left from the previous document.
    final Document doc = reuseFields ? ds.doc : new Document();
    doc.getFields().clear();

    // Set ID_FIELD. Always NOT_ANALYZED_NO_NORMS, independent of indexVal.
    Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
    // Candidate id sources (visible assignments; the selecting branches are
    // not visible): random within updateDocIDLimit, the source-provided id,
    // or the monotonically increasing counter.
    id = r.nextInt(updateDocIDLimit);
    id = docData.getID();
    id = numDocsCreated.getAndIncrement();
    idField.setValue(Integer.toString(id));

    // Set NAME_FIELD; a non-negative chunk count is appended so body chunks of
    // the same source document get distinct names.
    String name = docData.getName();
    if (name == null) name = "";
    name = cnt < 0 ? name : name + "_" + cnt;
    Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
    nameField.setValue(name);

    // Set DATE_FIELD, parsing with the per-thread DateUtil (SimpleDateFormat
    // is not thread-safe); create this thread's instance on first use.
    DateUtil util = dateParsers.get();
    util = new DateUtil();
    dateParsers.set(util);
    String dateString = docData.getDate();
    if (dateString != null) {
      // Reset the parse position so the parser can be reused across documents.
      util.pos.setIndex(0);
      date = util.parser.parse(dateString, util.pos);
      //System.out.println(dateString + " parsed to " + date);

    Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
    dateStringField.setValue(dateString);
    doc.add(dateStringField);

    // just set to right now

    // Numeric variant of the date (epoch millis).
    NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
    dateField.setLongValue(date.getTime());

    // Time-of-day in seconds since midnight, as a separate numeric field.
    util.cal.setTime(date);
    final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);

    NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
    timeSecField.setIntValue(sec);
    doc.add(timeSecField);

    // Set TITLE_FIELD (empty string when the source provides no title).
    String title = docData.getTitle();
    Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
    titleField.setValue(title == null ? "" : title);

    String body = docData.getBody();
    if (body != null && body.length() > 0) {
      // Decide how much of the body to use for this document.
      if (size <= 0 || size >= body.length()) {
        bdy = body; // use all
        docData.setBody(""); // nothing left
        // attempt not to break words - if whitespace found within next 20 chars...
        for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
          if (Character.isWhitespace(body.charAt(n))) {
        bdy = body.substring(0, size); // use part
        docData.setBody(body.substring(size)); // some left

      Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal);
      bodyField.setValue(bdy);

      // Optionally also store the raw UTF-8 bytes of the chosen body chunk.
      Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
      bytesField.setValue(bdy.getBytes("UTF-8"));

    // Optionally index every property the source attached to this DocData.
    if (indexProperties) {
      Properties props = docData.getProps();
      for (final Map.Entry<Object,Object> entry : props.entrySet()) {
        Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
        f.setValue((String) entry.getValue());
      // Clear so the same properties are not indexed again for later chunks.
      docData.setProps(null);

    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
315 private void resetLeftovers() {
  /**
   * Returns this thread's {@link DocState}, creating it on first use.
   * NOTE(review): the null-check guard and the docState.set/return lines are
   * not visible in this view.
   */
  protected DocState getDocState() {
    DocState ds = docState.get();
    ds = new DocState(reuseFields, storeVal, bodyStoreVal, indexVal, bodyIndexVal, termVecVal);
  /**
   * Closes the {@link DocMaker}. The base implementation closes the
   * {@link ContentSource}, and it can be overridden to do more work (but make
   * sure to call super.close()).
   *
   * @throws IOException if closing the content source fails
   */
  public void close() throws IOException {
  /**
   * Returns the number of bytes generated by the content source since last
   * reset (delegates to the {@link ContentSource}).
   */
  public synchronized long getBytesCount() {
    return source.getBytesCount();
  /**
   * Returns the total number of bytes that were generated by the content source
   * defined to that doc maker (delegates to the {@link ContentSource}).
   */
  public long getTotalBytesCount() {
    return source.getTotalBytesCount();
  /**
   * Creates a {@link Document} object ready for indexing. This method uses the
   * {@link ContentSource} to get the next document from the source, and creates
   * a {@link Document} object from the returned fields. If
   * <code>reuseFields</code> was set to true, it will reuse {@link Document}
   * and {@link Field} instances.
   *
   * @throws Exception if fetching the next DocData from the source fails
   */
  public Document makeDocument() throws Exception {
    // size=0 means "use the whole body"; cnt=-1 means no chunk suffix is
    // appended to the name field.
    DocData docData = source.getNextDocData(getDocState().docData);
    Document doc = createDocument(docData, 0, -1);
  /**
   * Same as {@link #makeDocument()}, only this method creates a document of the
   * given size input by <code>size</code>.
   * NOTE(review): several lines of this method (leftover bookkeeping, closing
   * braces, and the declaration of 'dd2') are not visible in this view.
   */
  public Document makeDocument(int size) throws Exception {
    LeftOver lvr = leftovr.get();
    // Use this thread's leftover body if one exists; otherwise pull fresh
    // data from the content source.
    if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
        || lvr.docdata.getBody().length() == 0) {

    DocData docData = getDocState().docData;
    DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
    int cnt = (lvr == null ? 0 : lvr.cnt);
    // Accumulate source documents until at least 'size' chars of body exist.
    while (dd.getBody() == null || dd.getBody().length() < size) {
      dd = source.getNextDocData(new DocData());
      dd.setBody(dd2.getBody() + dd.getBody());
    Document doc = createDocument(dd, size, cnt);
    // Visible tail: checks whether the body was fully consumed, and a
    // LeftOver is created (presumably to carry the remainder to the next
    // call — the branch structure is not visible here).
    if (dd.getBody() == null || dd.getBody().length() == 0) {
      lvr = new LeftOver();
  /**
   * Prints doc-generation statistics to stdout; counters are only reported
   * when they grew since the previous printout.
   * NOTE(review): the guards around 'print' and the declaration of 'col' are
   * not visible in this view.
   */
  public void printDocStatistics() {
    boolean print = false;
    StringBuilder sb = new StringBuilder();
    String newline = System.getProperty("line.separator");
    sb.append("------------> ").append(getClass().getSimpleName()).append(" statistics (").append(printNum).append("): ").append(newline);
    int nut = source.getTotalDocsCount();
    // Only report growth since the last printout.
    if (nut > lastPrintedNumUniqueTexts) {
      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
      lastPrintedNumUniqueTexts = nut;
    long nub = getTotalBytesCount();
    if (nub > lastPrintedNumUniqueBytes) {
      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
      lastPrintedNumUniqueBytes = nub;
    // Per-round counters, reported whenever any docs were produced.
    if (source.getDocsCount() > 0) {
      sb.append("num docs added since last inputs reset: ").append(Format.format(0,source.getDocsCount(),col)).append(newline);
      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
    System.out.println(sb.append(newline).toString());
  /** Reset inputs so that the test run would behave, input wise, as if it just started. */
  public synchronized void resetInputs() throws IOException {
    // Flush statistics before discarding the counters they summarize.
    printDocStatistics();
    // re-initiate since properties by round may have changed.
    // NOTE(review): the re-initiation the comment above refers to is not
    // visible in this view.
    source.resetInputs();
    numDocsCreated.set(0);
  /** Set the configuration parameters of this doc maker. */
  public void setConfig(Config config) {
    this.config = config;

    // Instantiate the configured ContentSource reflectively.
    // NOTE(review): the enclosing 'try {' is not visible in this view.
    String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
    source = Class.forName(sourceClass).asSubclass(ContentSource.class).newInstance();
    source.setConfig(config);
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);

    // Resolve per-field store/index settings; the body field can differ from
    // the rest.
    boolean stored = config.get("doc.stored", false);
    boolean bodyStored = config.get("doc.body.stored", stored);
    boolean tokenized = config.get("doc.tokenized", true);
    boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
    boolean norms = config.get("doc.tokenized.norms", false);
    boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
    boolean termVec = config.get("doc.term.vector", false);
    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
    bodyStoreVal = (bodyStored ? Field.Store.YES : Field.Store.NO);
    // Analyzed vs. not-analyzed, crossed with norms on/off.
    // NOTE(review): the if/else lines selecting between each assignment pair
    // (presumably on 'tokenized'/'bodyTokenized') are not visible here.
    indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
    indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;

    bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
    bodyIndexVal = bodyNorms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;

    // Term-vector flavor: positions/offsets combinations take precedence over
    // the plain doc.term.vector switch.
    boolean termVecPositions = config.get("doc.term.vector.positions", false);
    boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
    if (termVecPositions && termVecOffsets) {
      termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
    } else if (termVecPositions) {
      termVecVal = TermVector.WITH_POSITIONS;
    } else if (termVecOffsets) {
      termVecVal = TermVector.WITH_OFFSETS;
    } else if (termVec) {
      termVecVal = TermVector.YES;
      // NOTE(review): the '} else {' preceding the fallback below is not
      // visible in this view.
      termVecVal = TermVector.NO;

    storeBytes = config.get("doc.store.body.bytes", false);

    reuseFields = config.get("doc.reuse.fields", true);

    // In a multi-rounds run, it is important to reset DocState since settings
    // of fields may change between rounds, and this is the only way to reset
    // the cache of all threads.
    docState = new ThreadLocal<DocState>();

    indexProperties = config.get("doc.index.props", false);

    updateDocIDLimit = config.get("doc.random.id.limit", -1);
    if (updateDocIDLimit != -1) {