1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.UnsupportedEncodingException;
22 import java.util.HashMap;
23 import java.util.Calendar;
25 import java.util.Properties;
26 import java.util.Locale;
27 import java.util.Random;
28 import java.util.Date;
29 import java.util.concurrent.atomic.AtomicInteger;
30 import java.text.SimpleDateFormat;
31 import java.text.ParsePosition;
33 import org.apache.lucene.benchmark.byTask.utils.Config;
34 import org.apache.lucene.benchmark.byTask.utils.Format;
35 import org.apache.lucene.document.Document;
36 import org.apache.lucene.document.Field;
37 import org.apache.lucene.document.NumericField;
38 import org.apache.lucene.document.Field.Index;
39 import org.apache.lucene.document.Field.Store;
40 import org.apache.lucene.document.Field.TermVector;
43 * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
44 * {@link DocData} objects. Supports the following parameters:
46 * <li><b>content.source</b> - specifies the {@link ContentSource} class to use
47 * (default <b>SingleDocSource</b>).
48 * <li><b>doc.stored</b> - specifies whether fields should be stored (default
50 * <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default
51 * = <b>doc.stored</b>).
52 * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
53 * (default <b>true</b>).
54 * <li><b>doc.body.tokenized</b> - specifies whether the
55 * body field should be tokenized (default = <b>doc.tokenized</b>).
56 * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
57 * the index or not. (default <b>false</b>).
58 * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
59 * stored in the index for the body field. This can be set to true, while
60 * <code>doc.tokenized.norms</code> is set to false, to allow norms storing just
61 * for the body field. (default <b>true</b>).
62 * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
63 * for fields (default <b>false</b>).
64 * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
65 * be stored with positions (default <b>false</b>).
66 * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
67 * stored with offsets (default <b>false</b>).
68 * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
69 * the document's content in the document (default <b>false</b>).
70 * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
71 * should be reused (default <b>true</b>).
 * <li><b>doc.index.props</b> - specifies whether the properties returned by
 * {@link DocData#getProps()} will be indexed. (default <b>false</b>).
 * <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
 * IDs from 0 to this limit. This is useful with UpdateDoc
 * for testing performance of IndexWriter.updateDocument.
79 public class DocMaker {
  /**
   * Per-thread remainder of a source document whose body was only partially
   * consumed by {@link #makeDocument(int)}; the unconsumed body seeds the next
   * document created on the same thread.
   * NOTE(review): makeDocument(int) also reads a {@code cnt} field from this
   * class; its declaration (and the closing brace) is not visible in this view.
   */
  private static class LeftOver {
    private DocData docdata; // remaining, not-yet-indexed document data
87 private int updateDocIDLimit;
  /**
   * Per-thread cache of {@link Document}/{@link Field}/{@link NumericField}
   * instances, used when <code>doc.reuse.fields</code> is true to avoid
   * re-allocating field objects for every created document.
   */
  static class DocState {

    private final Map<String,Field> fields;               // cached text fields, keyed by field name
    private final Map<String,NumericField> numericFields; // cached numeric fields, keyed by field name
    private final boolean reuseFields;                    // when false, getters always allocate fresh instances
    DocData docData = new DocData();                      // reusable carrier filled by the ContentSource

    public DocState(boolean reuseFields, Store store, Store bodyStore, Index index, Index bodyIndex, TermVector termVector) {
      this.reuseFields = reuseFields;
      // NOTE(review): the reuse/non-reuse branch structure (and the
      // declaration of the 'doc' field assigned below) is not visible in this
      // view; comments describe only the visible lines.
      fields = new HashMap<String,Field>();
      numericFields = new HashMap<String,NumericField>();

      // Initialize the map with the default fields.
      fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
      fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
      fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
      // The id field is always stored and indexed verbatim, regardless of the
      // configured store/index settings.
      fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));

      numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
      numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));

      doc = new Document();
      // Non-reuse path: no field caches are kept.
      numericFields = null;

    /**
     * Returns a field corresponding to the field name. If
     * <code>reuseFields</code> was set to true, then it attempts to reuse a
     * Field instance. If such a field does not exist, it creates a new one.
     * NOTE(review): the guarding if/else, the cache insert, and the return of
     * the cached instance are not visible in this view.
     */
    Field getField(String name, Store store, Index index, TermVector termVector) {
      // Non-reuse path: always allocate a fresh field.
      return new Field(name, "", store, index, termVector);
      // Reuse path: look up the cache, creating (and caching) on a miss.
      Field f = fields.get(name);
      f = new Field(name, "", store, index, termVector);

    /** Same as {@link #getField}, but for {@link NumericField} instances. */
    NumericField getNumericField(String name) {
      // Non-reuse path: always allocate.
      return new NumericField(name);
      // Reuse path: cache lookup with create-on-miss.
      NumericField f = numericFields.get(name);
      f = new NumericField(name);
      numericFields.put(name, f);
155 private boolean storeBytes = false;
  /**
   * Per-thread date-parsing state. SimpleDateFormat and Calendar are not
   * thread-safe, so each worker thread gets its own instance via the
   * {@code dateParsers} ThreadLocal.
   */
  private static class DateUtil {
    // Parses dates of the form "01-Jan-2010 12:34:56".
    public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
    public Calendar cal = Calendar.getInstance();
    // Reusable parse position, reset by the caller before each parse.
    public ParsePosition pos = new ParsePosition(0);
    // NOTE(review): this statement belongs inside a constructor/initializer
    // block whose braces are not visible in this view.
    parser.setLenient(true);
  // leftovers are thread local, because it is unsafe to share residues between threads
  private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
  // Per-thread field/document cache; replaced wholesale in setConfig() to
  // invalidate every thread's cache between rounds.
  private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();
  // Per-thread date parser (SimpleDateFormat is not thread-safe).
  private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<DateUtil>();
  // Well-known names of the fields this maker adds to each document.
  public static final String BODY_FIELD = "body";
  public static final String TITLE_FIELD = "doctitle";
  public static final String DATE_FIELD = "docdate";
  public static final String DATE_MSEC_FIELD = "docdatenum";   // doc date as epoch millis (numeric field)
  public static final String TIME_SEC_FIELD = "doctimesecnum"; // time-of-day in seconds (numeric field)
  public static final String ID_FIELD = "docid";
  public static final String BYTES_FIELD = "bytes";            // raw UTF-8 body bytes, when doc.store.body.bytes=true
  public static final String NAME_FIELD = "docname";
  protected Config config; // benchmark configuration, set in setConfig()

  // Store/index/term-vector settings derived from config in setConfig().
  // The body field carries its own store/index settings, independent of the
  // other fields.
  protected Store storeVal = Store.NO;
  protected Store bodyStoreVal = Store.NO;
  protected Index indexVal = Index.ANALYZED_NO_NORMS;
  protected Index bodyIndexVal = Index.ANALYZED;
  protected TermVector termVecVal = TermVector.NO;

  protected ContentSource source;    // produces the raw DocData this maker turns into Documents
  protected boolean reuseFields;     // reuse Document/Field instances ("doc.reuse.fields")
  protected boolean indexProperties; // index DocData properties as extra fields ("doc.index.props")

  // High-water marks so printDocStatistics() only reports growth since the
  // previous printout.
  private int lastPrintedNumUniqueTexts = 0;
  private long lastPrintedNumUniqueBytes = 0;

  private final AtomicInteger numDocsCreated = new AtomicInteger(); // also serves as the default doc id
  private int printNum = 0; // ordinal of the next statistics printout
  // Creates a Document from the given DocData.
  // use only part of the body, modify it to keep the rest (or use all if size==0).
  // reset the docdata properties so they are not added more than once.
  //
  // NOTE(review): several control-flow lines (if/else branches, closing
  // braces, and the declarations of 'id', 'r', 'date' and 'bdy') are not
  // visible in this view; comments below describe only the visible lines.
  private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {

    final DocState ds = getDocState();
    // Reuse the cached Document when configured; otherwise allocate a new one,
    // then drop any fields left from the previous document.
    final Document doc = reuseFields ? ds.doc : new Document();
    doc.getFields().clear();

    // Set ID_FIELD. Always NOT_ANALYZED_NO_NORMS, independent of indexVal.
    Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
    // Candidate id sources (visible assignments; the selecting branches are
    // not visible): random within updateDocIDLimit, the source-provided id,
    // or the monotonically increasing counter.
    id = r.nextInt(updateDocIDLimit);
    id = docData.getID();
    id = numDocsCreated.getAndIncrement();
    idField.setValue(Integer.toString(id));

    // Set NAME_FIELD; a non-negative chunk count is appended so body chunks of
    // the same source document get distinct names.
    String name = docData.getName();
    if (name == null) name = "";
    name = cnt < 0 ? name : name + "_" + cnt;
    Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
    nameField.setValue(name);

    // Set DATE_FIELD, parsing with the per-thread DateUtil (SimpleDateFormat
    // is not thread-safe); create this thread's instance on first use.
    DateUtil util = dateParsers.get();
    util = new DateUtil();
    dateParsers.set(util);
    String dateString = docData.getDate();
    if (dateString != null) {
      // Reset the parse position so the parser can be reused across documents.
      util.pos.setIndex(0);
      date = util.parser.parse(dateString, util.pos);
      //System.out.println(dateString + " parsed to " + date);

    Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
    dateStringField.setValue(dateString);
    doc.add(dateStringField);

    // just set to right now

    // Numeric variant of the date (epoch millis).
    NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
    dateField.setLongValue(date.getTime());

    // Time-of-day in seconds since midnight, as a separate numeric field.
    util.cal.setTime(date);
    final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);

    NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
    timeSecField.setIntValue(sec);
    doc.add(timeSecField);

    // Set TITLE_FIELD (empty string when the source provides no title).
    String title = docData.getTitle();
    Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
    titleField.setValue(title == null ? "" : title);

    String body = docData.getBody();
    if (body != null && body.length() > 0) {
      // Decide how much of the body to use for this document.
      if (size <= 0 || size >= body.length()) {
        bdy = body; // use all
        docData.setBody(""); // nothing left
        // attempt not to break words - if whitespace found within next 20 chars...
        for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
          if (Character.isWhitespace(body.charAt(n))) {
        bdy = body.substring(0, size); // use part
        docData.setBody(body.substring(size)); // some left

      Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal);
      bodyField.setValue(bdy);

      // Optionally also store the raw UTF-8 bytes of the chosen body chunk.
      Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
      bytesField.setValue(bdy.getBytes("UTF-8"));

    // Optionally index every property the source attached to this DocData.
    if (indexProperties) {
      Properties props = docData.getProps();
      for (final Map.Entry<Object,Object> entry : props.entrySet()) {
        Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
        f.setValue((String) entry.getValue());
      // Clear so the same properties are not indexed again for later chunks.
      docData.setProps(null);

    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
315 private void resetLeftovers() {
  /**
   * Returns this thread's {@link DocState}, creating it on first use.
   * NOTE(review): the null-check guard and the docState.set/return lines are
   * not visible in this view.
   */
  protected DocState getDocState() {
    DocState ds = docState.get();
    ds = new DocState(reuseFields, storeVal, bodyStoreVal, indexVal, bodyIndexVal, termVecVal);
  /**
   * Closes the {@link DocMaker}. The base implementation closes the
   * {@link ContentSource}, and it can be overridden to do more work (but make
   * sure to call super.close()).
   *
   * @throws IOException if closing the content source fails
   */
  public void close() throws IOException {
  /**
   * Returns the number of bytes generated by the content source since last
   * reset (delegates to the {@link ContentSource}).
   */
  public synchronized long getBytesCount() {
    return source.getBytesCount();
  /**
   * Returns the total number of bytes that were generated by the content source
   * defined to that doc maker (delegates to the {@link ContentSource}).
   */
  public long getTotalBytesCount() {
    return source.getTotalBytesCount();
  /**
   * Creates a {@link Document} object ready for indexing. This method uses the
   * {@link ContentSource} to get the next document from the source, and creates
   * a {@link Document} object from the returned fields. If
   * <code>reuseFields</code> was set to true, it will reuse {@link Document}
   * and {@link Field} instances.
   *
   * @throws Exception if fetching the next DocData from the source fails
   */
  public Document makeDocument() throws Exception {
    // size=0 means "use the whole body"; cnt=-1 means no chunk suffix is
    // appended to the name field.
    DocData docData = source.getNextDocData(getDocState().docData);
    Document doc = createDocument(docData, 0, -1);
  /**
   * Same as {@link #makeDocument()}, only this method creates a document of the
   * given size input by <code>size</code>.
   * NOTE(review): several lines of this method (leftover bookkeeping, closing
   * braces, and the declaration of 'dd2') are not visible in this view.
   */
  public Document makeDocument(int size) throws Exception {
    LeftOver lvr = leftovr.get();
    // Use this thread's leftover body if one exists; otherwise pull fresh
    // data from the content source.
    if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
        || lvr.docdata.getBody().length() == 0) {

    DocData docData = getDocState().docData;
    DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
    int cnt = (lvr == null ? 0 : lvr.cnt);
    // Accumulate source documents until at least 'size' chars of body exist.
    while (dd.getBody() == null || dd.getBody().length() < size) {
      dd = source.getNextDocData(new DocData());
      dd.setBody(dd2.getBody() + dd.getBody());
    Document doc = createDocument(dd, size, cnt);
    // Visible tail: checks whether the body was fully consumed, and a
    // LeftOver is created (presumably to carry the remainder to the next
    // call — the branch structure is not visible here).
    if (dd.getBody() == null || dd.getBody().length() == 0) {
      lvr = new LeftOver();
  /**
   * Prints doc-generation statistics to stdout; counters are only reported
   * when they grew since the previous printout.
   * NOTE(review): the guards around 'print' and the declaration of 'col' are
   * not visible in this view.
   */
  public void printDocStatistics() {
    boolean print = false;
    StringBuilder sb = new StringBuilder();
    String newline = System.getProperty("line.separator");
    sb.append("------------> ").append(getClass().getSimpleName()).append(" statistics (").append(printNum).append("): ").append(newline);
    int nut = source.getTotalDocsCount();
    // Only report growth since the last printout.
    if (nut > lastPrintedNumUniqueTexts) {
      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
      lastPrintedNumUniqueTexts = nut;
    long nub = getTotalBytesCount();
    if (nub > lastPrintedNumUniqueBytes) {
      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
      lastPrintedNumUniqueBytes = nub;
    // Per-round counters, reported whenever any docs were produced.
    if (source.getDocsCount() > 0) {
      sb.append("num docs added since last inputs reset: ").append(Format.format(0,source.getDocsCount(),col)).append(newline);
      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
    System.out.println(sb.append(newline).toString());
  /** Reset inputs so that the test run would behave, input wise, as if it just started. */
  public synchronized void resetInputs() throws IOException {
    // Flush statistics before discarding the counters they summarize.
    printDocStatistics();
    // re-initiate since properties by round may have changed.
    // NOTE(review): the re-initiation the comment above refers to is not
    // visible in this view.
    source.resetInputs();
    numDocsCreated.set(0);
  /** Set the configuration parameters of this doc maker. */
  public void setConfig(Config config) {
    this.config = config;

    // Instantiate the configured ContentSource reflectively.
    // NOTE(review): the enclosing 'try {' is not visible in this view.
    String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
    source = Class.forName(sourceClass).asSubclass(ContentSource.class).newInstance();
    source.setConfig(config);
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);

    // Resolve per-field store/index settings; the body field can differ from
    // the rest.
    boolean stored = config.get("doc.stored", false);
    boolean bodyStored = config.get("doc.body.stored", stored);
    boolean tokenized = config.get("doc.tokenized", true);
    boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
    boolean norms = config.get("doc.tokenized.norms", false);
    boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
    boolean termVec = config.get("doc.term.vector", false);
    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
    bodyStoreVal = (bodyStored ? Field.Store.YES : Field.Store.NO);
    // Analyzed vs. not-analyzed, crossed with norms on/off.
    // NOTE(review): the if/else lines selecting between each assignment pair
    // (presumably on 'tokenized'/'bodyTokenized') are not visible here.
    indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
    indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;

    bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
    bodyIndexVal = bodyNorms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;

    // Term-vector flavor: positions/offsets combinations take precedence over
    // the plain doc.term.vector switch.
    boolean termVecPositions = config.get("doc.term.vector.positions", false);
    boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
    if (termVecPositions && termVecOffsets) {
      termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
    } else if (termVecPositions) {
      termVecVal = TermVector.WITH_POSITIONS;
    } else if (termVecOffsets) {
      termVecVal = TermVector.WITH_OFFSETS;
    } else if (termVec) {
      termVecVal = TermVector.YES;
      // NOTE(review): the '} else {' preceding the fallback below is not
      // visible in this view.
      termVecVal = TermVector.NO;

    storeBytes = config.get("doc.store.body.bytes", false);

    reuseFields = config.get("doc.reuse.fields", true);

    // In a multi-rounds run, it is important to reset DocState since settings
    // of fields may change between rounds, and this is the only way to reset
    // the cache of all threads.
    docState = new ThreadLocal<DocState>();

    indexProperties = config.get("doc.index.props", false);

    updateDocIDLimit = config.get("doc.random.id.limit", -1);
    if (updateDocIDLimit != -1) {