X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/backwards/src/test-framework/org/apache/lucene/util/LineFileDocs.java diff --git a/lucene-java-3.5.0/lucene/backwards/src/test-framework/org/apache/lucene/util/LineFileDocs.java b/lucene-java-3.5.0/lucene/backwards/src/test-framework/org/apache/lucene/util/LineFileDocs.java new file mode 100644 index 0000000..a4cd41f --- /dev/null +++ b/lucene-java-3.5.0/lucene/backwards/src/test-framework/org/apache/lucene/util/LineFileDocs.java @@ -0,0 +1,178 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.InputStream; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.zip.GZIPInputStream; +import java.util.Random; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +/** Minimal port of contrib/benchmark's LneDocSource + + * DocMaker, so tests can enum docs from a line file created + * by contrib/benchmark's WriteLineDoc task */ +public class LineFileDocs implements Closeable { + + private BufferedReader reader; + private final static int BUFFER_SIZE = 1 << 16; // 64K + private final AtomicInteger id = new AtomicInteger(); + private final String path; + + /** If forever is true, we rewind the file at EOF (repeat + * the docs over and over) */ + public LineFileDocs(Random random, String path) throws IOException { + this.path = path; + open(random); + } + + public LineFileDocs(Random random) throws IOException { + this(random, LuceneTestCase.TEST_LINE_DOCS_FILE); + } + + public synchronized void close() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + private synchronized void open(Random random) throws IOException { + InputStream is = getClass().getResourceAsStream(path); + if (is == null) { + // if its not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir) + is = new FileInputStream(path); + } + File file = new File(path); + long size; + if (file.exists()) { + size = file.length(); + } else { + size = is.available(); + } + if (path.endsWith(".gz")) { + is = new GZIPInputStream(is); + // guestimate: + size *= 2.8; + } + + reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE); + + // Override sizes for currently "known" line files: + if (path.equals("europarl.lines.txt.gz")) { + size = 15129506L; + } else if (path.equals("/home/hudson/lucene-data/enwiki.random.lines.txt.gz")) { + size = 3038178822L; + } + + // Randomly seek to starting point: + if (random != null && size > 3) { + final long seekTo = (random.nextLong()&Long.MAX_VALUE) % (size/3); + if (LuceneTestCase.VERBOSE) { + System.out.println("TEST: LineFileDocs: seek to fp=" + seekTo + " on open"); + } + reader.skip(seekTo); + reader.readLine(); + } + } + + public synchronized void reset(Random random) throws IOException { + close(); + open(random); + id.set(0); + } + + private final static char SEP = '\t'; + + private static final class DocState { + final Document doc; + final Field titleTokenized; + final Field title; + final Field body; + final Field id; + final Field date; + + public DocState() { + doc = new Document(); + + title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS); + doc.add(title); + + titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + doc.add(titleTokenized); + + body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); + doc.add(body); + + id = new Field("docid", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + doc.add(id); + + date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + doc.add(date); + } + } + + private final ThreadLocal threadDocs = new ThreadLocal(); + + /** Note: Document instance is re-used per-thread */ + public Document nextDoc() throws IOException { + String line; + synchronized(this) { + line = reader.readLine(); + if (line == null) { + // Always rewind at end: + if (LuceneTestCase.VERBOSE) { + System.out.println("TEST: LineFileDocs: now rewind file..."); + } + close(); + open(null); + line = reader.readLine(); + } + } + + DocState docState = threadDocs.get(); + if (docState == null) { + docState = new DocState(); + threadDocs.set(docState); + } + + int spot = line.indexOf(SEP); + if (spot == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + int spot2 = line.indexOf(SEP, 1 + spot); + if (spot2 == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + + docState.body.setValue(line.substring(1+spot2, line.length())); + final String title = line.substring(0, spot); + docState.title.setValue(title); + docState.titleTokenized.setValue(title); + docState.date.setValue(line.substring(1+spot, spot2)); + docState.id.setValue(Integer.toString(id.getAndIncrement())); + return docState.doc; + } +}