1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Arrays;
25 import org.apache.lucene.benchmark.byTask.utils.Config;
28 * Represents content from a specified source, such as TREC, Reuters etc. A
29 * {@link ContentSource} is responsible for creating {@link DocData} objects for
30 * its documents to be consumed by {@link DocMaker}. It also keeps track
 * of various statistics, such as how many documents were generated and the size in bytes.
34 * Supports the following configuration parameters:
36 * <li><b>content.source.forever</b> - specifies whether to generate documents
37 * forever (<b>default=true</b>).
38 * <li><b>content.source.verbose</b> - specifies whether messages should be
39 * output by the content source (<b>default=false</b>).
40 * <li><b>content.source.encoding</b> - specifies which encoding to use when
41 * reading the files of that content source. Certain implementations may define
42 * a default value if this parameter is not specified. (<b>default=null</b>).
43 * <li><b>content.source.log.step</b> - specifies for how many documents a
44 * message should be logged. If set to 0 it means no logging should occur.
45 * <b>NOTE:</b> if verbose is set to false, logging should not occur even if
46 * logStep is not 0 (<b>default=0</b>).
49 public abstract class ContentSource {
// Bytes generated so far; increased by addBytes() and reported by getBytesCount() as the count since the last reset.
private long bytesCount;
// Running total of generated bytes; increased by addBytes() and reported by getTotalBytesCount().
private long totalBytesCount;
// Documents generated since the last reset, as reported by getDocsCount(); also consulted by shouldLog().
private int docsCount;
// Total number of generated documents, as reported by getTotalDocsCount().
private int totalDocsCount;
// Configuration stored by setConfig() and returned by getConfig().
private Config config;
// Value of "content.source.forever" (default true), read in setConfig().
protected boolean forever;
// Value of "content.source.log.step" (default 0), read in setConfig(); shouldLog() requires it to be positive.
protected int logStep;
// Value of "content.source.verbose" (default false), read in setConfig(); shouldLog() requires it to be true.
protected boolean verbose;
// Value of "content.source.encoding" (default null), read in setConfig().
protected String encoding;
62 /** update count of bytes generated by this source */
63 protected final synchronized void addBytes(long numBytes) {
64 bytesCount += numBytes;
65 totalBytesCount += numBytes;
68 /** update count of documents generated by this source */
69 protected final synchronized void addDoc() {
75 * A convenience method for collecting all the files of a content source from
76 * a given directory. The collected {@link File} instances are stored in the
77 * given <code>files</code>.
79 protected final void collectFiles(File dir, ArrayList<File> files) {
84 File[] dirFiles = dir.listFiles();
85 Arrays.sort(dirFiles);
86 for (int i = 0; i < dirFiles.length; i++) {
87 File file = dirFiles[i];
88 if (file.isDirectory()) {
89 collectFiles(file, files);
90 } else if (file.canRead()) {
97 * Returns true whether it's time to log a message (depending on verbose and
98 * the number of documents generated).
100 protected final boolean shouldLog() {
101 return verbose && logStep > 0 && docsCount % logStep == 0;
/**
 * Called when reading from this content source is no longer required.
 *
 * @throws IOException declared so that implementations may report an I/O failure
 */
public abstract void close() throws IOException;
107 /** Returns the number of bytes generated since last reset. */
108 public final long getBytesCount() { return bytesCount; }
110 /** Returns the number of generated documents since last reset. */
111 public final int getDocsCount() { return docsCount; }
113 public final Config getConfig() { return config; }
/**
 * Returns the next {@link DocData} from the content source.
 *
 * @param docData a {@link DocData} supplied by the caller; NOTE(review):
 *        presumably intended to be filled in and reused by implementations - confirm
 * @return the next {@link DocData}
 * @throws NoMoreDataException NOTE(review): presumably signals that the source
 *         has no further documents - confirm against implementations
 * @throws IOException declared so that implementations may report an I/O failure
 */
public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException;
118 /** Returns the total number of bytes that were generated by this source. */
119 public final long getTotalBytesCount() { return totalBytesCount; }
121 /** Returns the total number of generated documents. */
122 public final int getTotalDocsCount() { return totalDocsCount; }
125 * Resets the input for this content source, so that the test would behave as
126 * if it was just started, input-wise.
128 * <b>NOTE:</b> the default implementation resets the number of bytes and
129 * documents generated since the last reset, so it's important to call
130 * super.resetInputs in case you override this method.
132 public void resetInputs() throws IOException {
138 * Sets the {@link Config} for this content source. If you override this
139 * method, you must call super.setConfig.
141 public void setConfig(Config config) {
142 this.config = config;
143 forever = config.get("content.source.forever", true);
144 logStep = config.get("content.source.log.step", 0);
145 verbose = config.get("content.source.verbose", false);
146 encoding = config.get("content.source.encoding", null);