X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
diff --git a/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java b/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
deleted file mode 100644
index 142e408..0000000
--- a/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
+++ /dev/null
@@ -1,502 +0,0 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.util.HashMap;
-import java.util.Calendar;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Locale;
-import java.util.Random;
-import java.util.Date;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.text.SimpleDateFormat;
-import java.text.ParsePosition;
-
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.NumericField;
-import org.apache.lucene.document.Field.Index;
-import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.document.Field.TermVector;
-
-/**
- * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
- * {@link DocData} objects. Supports the following parameters:
- *
- * <ul>
- * <li><b>content.source</b> - specifies the {@link ContentSource} class to use
- * (default <b>SingleDocSource</b>).
- * <li><b>doc.stored</b> - specifies whether fields should be stored (default
- * <b>false</b>).
- * <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default
- * = <b>doc.stored</b>).
- * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
- * (default <b>true</b>).
- * <li><b>doc.body.tokenized</b> - specifies whether the
- * body field should be tokenized (default = <b>doc.tokenized</b>).
- * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
- * the index or not. (default <b>false</b>).
- * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
- * stored in the index for the body field. This can be set to true, while
- * <b>doc.tokenized.norms</b> is set to false, to allow norms storing just
- * for the body field. (default <b>true</b>).
- * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
- * for fields (default <b>false</b>).
- * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
- * be stored with positions (default <b>false</b>).
- * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
- * stored with offsets (default <b>false</b>).
- * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
- * the document's content in the document (default <b>false</b>).
- * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
- * should be reused (default <b>true</b>).
- * <li><b>doc.index.props</b> - specifies whether the properties returned by
- * {@link DocData#getProps()} will be indexed. (default <b>false</b>).
- * <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
- * IDs from 0 to this limit. This is useful with <b>UpdateDoc</b>
- * for testing performance of IndexWriter.updateDocument.
- * </ul>
- *
- */
-public class DocMaker {
-
- private static class LeftOver {
- private DocData docdata;
- private int cnt;
- }
-
- private Random r;
- private int updateDocIDLimit;
-
- static class DocState {
-
- private final Map fields;
- private final Map numericFields;
- private final boolean reuseFields;
- final Document doc;
- DocData docData = new DocData();
-
- public DocState(boolean reuseFields, Store store, Store bodyStore, Index index, Index bodyIndex, TermVector termVector) {
-
- this.reuseFields = reuseFields;
-
- if (reuseFields) {
- fields = new HashMap();
- numericFields = new HashMap();
-
- // Initialize the map with the default fields.
- fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
- fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
- fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
- fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
- fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
-
- numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
- numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));
-
- doc = new Document();
- } else {
- numericFields = null;
- fields = null;
- doc = null;
- }
- }
-
- /**
- * Returns a field corresponding to the field name. If
- * reuseFields was set to true, then it attempts to reuse a
- * Field instance. If such a field does not exist, it creates a new one.
- */
- Field getField(String name, Store store, Index index, TermVector termVector) {
- if (!reuseFields) {
- return new Field(name, "", store, index, termVector);
- }
-
- Field f = fields.get(name);
- if (f == null) {
- f = new Field(name, "", store, index, termVector);
- fields.put(name, f);
- }
- return f;
- }
-
- NumericField getNumericField(String name) {
- if (!reuseFields) {
- return new NumericField(name);
- }
-
- NumericField f = numericFields.get(name);
- if (f == null) {
- f = new NumericField(name);
- numericFields.put(name, f);
- }
- return f;
- }
- }
-
- private boolean storeBytes = false;
-
- private static class DateUtil {
- public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
- public Calendar cal = Calendar.getInstance();
- public ParsePosition pos = new ParsePosition(0);
- public DateUtil() {
- parser.setLenient(true);
- }
- }
-
- // leftovers are thread local, because it is unsafe to share residues between threads
- private ThreadLocal leftovr = new ThreadLocal();
- private ThreadLocal docState = new ThreadLocal();
- private ThreadLocal dateParsers = new ThreadLocal();
-
- public static final String BODY_FIELD = "body";
- public static final String TITLE_FIELD = "doctitle";
- public static final String DATE_FIELD = "docdate";
- public static final String DATE_MSEC_FIELD = "docdatenum";
- public static final String TIME_SEC_FIELD = "doctimesecnum";
- public static final String ID_FIELD = "docid";
- public static final String BYTES_FIELD = "bytes";
- public static final String NAME_FIELD = "docname";
-
- protected Config config;
-
- protected Store storeVal = Store.NO;
- protected Store bodyStoreVal = Store.NO;
- protected Index indexVal = Index.ANALYZED_NO_NORMS;
- protected Index bodyIndexVal = Index.ANALYZED;
- protected TermVector termVecVal = TermVector.NO;
-
- protected ContentSource source;
- protected boolean reuseFields;
- protected boolean indexProperties;
-
- private int lastPrintedNumUniqueTexts = 0;
-
- private long lastPrintedNumUniqueBytes = 0;
- private final AtomicInteger numDocsCreated = new AtomicInteger();
-
- private int printNum = 0;
-
- // create a doc
- // use only part of the body, modify it to keep the rest (or use all if size==0).
- // reset the docdata properties so they are not added more than once.
- private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
-
- final DocState ds = getDocState();
- final Document doc = reuseFields ? ds.doc : new Document();
- doc.getFields().clear();
-
- // Set ID_FIELD
- Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
- int id;
- if (r != null) {
- id = r.nextInt(updateDocIDLimit);
- } else {
- id = docData.getID();
- if (id == -1) {
- id = numDocsCreated.getAndIncrement();
- }
- }
- idField.setValue(Integer.toString(id));
- doc.add(idField);
-
- // Set NAME_FIELD
- String name = docData.getName();
- if (name == null) name = "";
- name = cnt < 0 ? name : name + "_" + cnt;
- Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
- nameField.setValue(name);
- doc.add(nameField);
-
- // Set DATE_FIELD
- DateUtil util = dateParsers.get();
- if (util == null) {
- util = new DateUtil();
- dateParsers.set(util);
- }
- Date date = null;
- String dateString = docData.getDate();
- if (dateString != null) {
- util.pos.setIndex(0);
- date = util.parser.parse(dateString, util.pos);
- //System.out.println(dateString + " parsed to " + date);
- } else {
- dateString = "";
- }
- Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
- dateStringField.setValue(dateString);
- doc.add(dateStringField);
-
- if (date == null) {
- // just set to right now
- date = new Date();
- }
-
- NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
- dateField.setLongValue(date.getTime());
- doc.add(dateField);
-
- util.cal.setTime(date);
- final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);
-
- NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
- timeSecField.setIntValue(sec);
- doc.add(timeSecField);
-
- // Set TITLE_FIELD
- String title = docData.getTitle();
- Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
- titleField.setValue(title == null ? "" : title);
- doc.add(titleField);
-
- String body = docData.getBody();
- if (body != null && body.length() > 0) {
- String bdy;
- if (size <= 0 || size >= body.length()) {
- bdy = body; // use all
- docData.setBody(""); // nothing left
- } else {
- // attempt not to break words - if whitespace found within next 20 chars...
- for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
- if (Character.isWhitespace(body.charAt(n))) {
- size = n;
- break;
- }
- }
- bdy = body.substring(0, size); // use part
- docData.setBody(body.substring(size)); // some left
- }
- Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal);
- bodyField.setValue(bdy);
- doc.add(bodyField);
-
- if (storeBytes) {
- Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
- bytesField.setValue(bdy.getBytes("UTF-8"));
- doc.add(bytesField);
- }
- }
-
- if (indexProperties) {
- Properties props = docData.getProps();
- if (props != null) {
- for (final Map.Entry