1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.Closeable;
22 import java.io.FileInputStream;
23 import java.io.IOException;
24 import java.io.BufferedReader;
25 import java.io.InputStreamReader;
26 import java.io.InputStream;
27 import java.util.concurrent.atomic.AtomicInteger;
28 import java.util.zip.GZIPInputStream;
29 import java.util.Random;
31 import org.apache.lucene.document.Document;
32 import org.apache.lucene.document.Field;
34 /** Minimal port of contrib/benchmark's LineDocSource +
35 * DocMaker, so tests can enumerate docs from a line file created
36 * by contrib/benchmark's WriteLineDoc task */
37 public class LineFileDocs implements Closeable {
39 private BufferedReader reader;
40 private final static int BUFFER_SIZE = 1 << 16; // 64K
41 private final AtomicInteger id = new AtomicInteger();
42 private final String path;
44 /** Reads docs from the line file at the given path. The file is
45 * always rewound at EOF (the docs repeat over and over) */
46 public LineFileDocs(Random random, String path) throws IOException {
51 public LineFileDocs(Random random) throws IOException {
52 this(random, LuceneTestCase.TEST_LINE_DOCS_FILE);
55 public synchronized void close() throws IOException {
62 private synchronized void open(Random random) throws IOException {
63 InputStream is = getClass().getResourceAsStream(path);
65 // if it's not in the classpath, we load it as an absolute filesystem path (e.g. Hudson's home dir)
66 is = new FileInputStream(path);
68 File file = new File(path);
73 size = is.available();
75 if (path.endsWith(".gz")) {
76 is = new GZIPInputStream(is);
81 reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
83 // Override sizes for currently "known" line files:
84 if (path.equals("europarl.lines.txt.gz")) {
86 } else if (path.equals("/home/hudson/lucene-data/enwiki.random.lines.txt.gz")) {
90 // Randomly seek to starting point:
91 if (random != null && size > 3) {
92 final long seekTo = (random.nextLong()&Long.MAX_VALUE) % (size/3);
93 if (LuceneTestCase.VERBOSE) {
94 System.out.println("TEST: LineFileDocs: seek to fp=" + seekTo + " on open");
101 public synchronized void reset(Random random) throws IOException {
107 private final static char SEP = '\t';
109 private static final class DocState {
111 final Field titleTokenized;
118 doc = new Document();
120 title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
123 titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
124 doc.add(titleTokenized);
126 body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
129 id = new Field("docid", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
132 date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
137 private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();
139 /** Note: Document instance is re-used per-thread */
140 public Document nextDoc() throws IOException {
143 line = reader.readLine();
145 // Always rewind at end:
146 if (LuceneTestCase.VERBOSE) {
147 System.out.println("TEST: LineFileDocs: now rewind file...");
151 line = reader.readLine();
155 DocState docState = threadDocs.get();
156 if (docState == null) {
157 docState = new DocState();
158 threadDocs.set(docState);
161 int spot = line.indexOf(SEP);
163 throw new RuntimeException("line: [" + line + "] is in an invalid format !");
165 int spot2 = line.indexOf(SEP, 1 + spot);
167 throw new RuntimeException("line: [" + line + "] is in an invalid format !");
170 docState.body.setValue(line.substring(1+spot2, line.length()));
171 final String title = line.substring(0, spot);
172 docState.title.setValue(title);
173 docState.titleTokenized.setValue(title);
174 docState.date.setValue(line.substring(1+spot, spot2));
175 docState.id.setValue(Integer.toString(id.getAndIncrement()));