package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;

import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
import org.apache.lucene.util.ThreadInterruptedException;

/**
 * Implements a {@link ContentSource} over the TREC collection.
 * <p>
 * Supports the following configuration parameters (on top of
 * {@link ContentSource}):
 * <ul>
 * <li><b>work.dir</b> - the working directory, used to resolve "docs.dir"
 * when that is a relative path (<b>default=work</b>).
 * <li><b>docs.dir</b> - the directory from which the TREC files are
 * collected (<b>default=trec</b>).
 * <li><b>trec.doc.parser</b> - the {@link TrecDocParser} class used to parse
 * each document (<b>default=TrecGov2Parser</b>).
 * <li><b>html.parser</b> - the {@link HTMLParser} class made available to the
 * document parser (<b>default=DemoHTMLParser</b>).
 * <li><b>content.source.excludeIteration</b> - if true, the iteration number
 * is not appended to the document name (<b>default=false</b>).
 * </ul>
 * If no encoding has been set by the time {@link #setConfig(Config)} runs,
 * ISO-8859-1 is used.
 */
public class TrecContentSource extends ContentSource {

  /** Per-thread date parsing state: one formatter per known pattern plus a reusable position. */
  private static final class DateFormatInfo {
    DateFormat[] dfs;
    ParsePosition pos;
  }

  /** Tag that opens the document name line. */
  public static final String DOCNO = "<DOCNO>";
  /** Tag that closes the document name. */
  public static final String TERMINATING_DOCNO = "</DOCNO>";
  /** Tag that opens a document. */
  public static final String DOC = "<DOC>";
  /** Tag that closes a document. */
  public static final String TERMINATING_DOC = "</DOC>";

  /** separator between lines in the buffer */
  public static final String NEW_LINE = System.getProperty("line.separator");

  private static final String DATE_FORMATS [] = {
       "EEE, dd MMM yyyy kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
       "EEE MMM dd kk:mm:ss yyyy z",    // Tue Dec 09 16:45:08 2003 EST
       "EEE, dd-MMM-':'y kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
       "EEE, dd-MMM-yyy kk:mm:ss z",    // Tue, 09 Dec 2003 22:39:08 GMT
       "EEE MMM dd kk:mm:ss yyyy",      // Tue Dec 09 16:45:08 2003
       "dd MMM yyyy",                   // 1 March 1994
       "MMM dd, yyyy",                  // February 3, 1994
       "yyMMdd",                        // 910513
       "hhmm z.z.z. MMM dd, yyyy",      // 0901 u.t.c. April 28, 1994
  };

  // Per-thread state, so formatters, readers and buffers are never shared
  // between threads.
  private final ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
  private final ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
  private final ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
  private File dataDir = null;
  private final ArrayList<File> inputFiles = new ArrayList<File>();
  private int nextFile = 0;
  // Running total of the lengths of all lines read so far.
  private int rawDocSize = 0;

  // Use to synchronize threads on reading from the TREC documents.
  private final Object lock = new Object();

  // Required for test
  BufferedReader reader;
  int iteration = 0;
  HTMLParser htmlParser;

  private boolean excludeDocnameIteration;
  private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
  ParsePathType currPathType; // not private for tests

  /**
   * Returns the calling thread's date formatters, creating them on first use.
   * All formatters use {@link Locale#US} and lenient parsing.
   */
  private DateFormatInfo getDateFormatInfo() {
    DateFormatInfo dfi = dateFormats.get();
    if (dfi == null) {
      dfi = new DateFormatInfo();
      dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
      for (int i = 0; i < dfi.dfs.length; i++) {
        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
        dfi.dfs[i].setLenient(true);
      }
      dfi.pos = new ParsePosition(0);
      dateFormats.set(dfi);
    }
    return dfi;
  }

  /** Returns the calling thread's document buffer, creating it on first use. */
  private StringBuilder getDocBuffer() {
    StringBuilder sb = trecDocBuffer.get();
    if (sb == null) {
      sb = new StringBuilder();
      trecDocBuffer.set(sb);
    }
    return sb;
  }

  /**
   * Returns the calling thread's reader, (re)positioned over the given buffer.
   * The same reader instance is reused across calls on one thread.
   */
  Reader getTrecDocReader(StringBuilder docBuffer) {
    StringBuilderReader r = trecDocReader.get();
    if (r == null) {
      r = new StringBuilderReader(docBuffer);
      trecDocReader.set(r);
    } else {
      r.set(docBuffer);
    }
    return r;
  }

  /** Returns the HTML parser instantiated by {@link #setConfig(Config)}. */
  HTMLParser getHtmlParser() {
    return htmlParser;
  }

  /**
   * Read until a line starting with the specified <code>lineStart</code>.
   * When the current file is exhausted, reading continues with the next file.
   *
   * @param buf buffer for collecting the data if so specified.
   * @param lineStart line start to look for, must not be null.
   * @param collectMatchLine whether to collect the matching line into <code>buf</code>.
   * @param collectAll whether to collect all lines into <code>buf</code>.
   * @throws IOException if reading fails.
   * @throws NoMoreDataException if all input files are exhausted.
   */
  private void read(StringBuilder buf, String lineStart,
      boolean collectMatchLine, boolean collectAll) throws IOException, NoMoreDataException {
    String sep = "";
    while (true) {
      String line = reader.readLine();

      if (line == null) {
        // End of this file: move on (throws NoMoreDataException when done).
        openNextFile();
        continue;
      }

      rawDocSize += line.length();

      if (lineStart != null && line.startsWith(lineStart)) {
        if (collectMatchLine) {
          buf.append(sep).append(line);
          sep = NEW_LINE;
        }
        return;
      }

      if (collectAll) {
        buf.append(sep).append(line);
        sep = NEW_LINE;
      }
    }
  }

  /**
   * Closes the current reader and opens the next input file, wrapping around
   * to the first file (and incrementing the iteration) when <code>forever</code>
   * is set.
   *
   * @throws NoMoreDataException if the files are exhausted and
   *         <code>forever</code> is false, or if a file cannot be opened
   *         while <code>verbose</code> is off.
   * @throws IOException if closing a reader fails.
   */
  void openNextFile() throws NoMoreDataException, IOException {
    close();
    currPathType = null;
    while (true) {
      if (nextFile >= inputFiles.size()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        nextFile = 0;
        iteration++;
      }
      File f = inputFiles.get(nextFile++);
      if (verbose) {
        System.out.println("opening: " + f + " length: " + f.length());
      }
      InputStream inputStream = null;
      try {
        inputStream = StreamUtils.inputStream(f); // support either gzip, bzip2, or regular text file, by extension
        reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE);
        currPathType = TrecDocParser.pathType(f);
        return;
      } catch (Exception e) {
        // Release whatever was opened for this file so it does not leak
        // when we move on (or give up).
        if (reader != null) {
          close();
        } else if (inputStream != null) {
          try {
            inputStream.close();
          } catch (IOException ignored) {
            // best effort: the file is being skipped anyway
          }
        }
        if (verbose) {
          System.out.println("Skipping 'bad' file " + f.getAbsolutePath()+" due to "+e.getMessage());
          continue;
        }
        // NOTE(review): a bad file is only skipped in verbose mode; otherwise
        // the run ends here and the cause is not propagated - confirm intended.
        throw new NoMoreDataException();
      }
    }
  }

  /**
   * Tries each known date pattern in turn against the trimmed input.
   *
   * @param dateStr the text to parse; may be null.
   * @return the parsed date, or <code>null</code> if the input is null or
   *         matches none of the known patterns.
   */
  public Date parseDate(String dateStr) {
    if (dateStr == null) {
      return null;
    }
    dateStr = dateStr.trim();
    DateFormatInfo dfi = getDateFormatInfo();
    for (int i = 0; i < dfi.dfs.length; i++) {
      DateFormat df = dfi.dfs[i];
      // The position object is reused, so reset it before every attempt.
      dfi.pos.setIndex(0);
      dfi.pos.setErrorIndex(-1);
      Date d = df.parse(dateStr, dfi.pos);
      if (d != null) {
        // Parse succeeded.
        return d;
      }
    }
    // do not fail test just because a date could not be parsed
    if (verbose) {
      System.out.println("failed to parse date (returning null) for: " + dateStr);
    }
    return null;
  }

  /**
   * Closes the current reader, if any. A failure to close is reported only
   * in verbose mode and is otherwise ignored.
   */
  @Override
  public void close() throws IOException {
    if (reader == null) {
      return;
    }

    try {
      reader.close();
    } catch (IOException e) {
      if (verbose) {
        System.out.println("failed to close reader !");
        e.printStackTrace(System.out);
      }
    } finally {
      reader = null;
    }
  }

  /**
   * Reads the next document from the input files and hands its raw text to
   * the configured {@link TrecDocParser}.
   *
   * @param docData the instance passed on to the parser.
   * @return the value returned by the parser.
   * @throws NoMoreDataException if there are no more documents.
   * @throws IOException if reading fails.
   */
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String name = null;
    StringBuilder docBuf = getDocBuffer();
    ParsePathType parsedPathType;

    // protect reading from the TREC files by multiple threads. The rest of the
    // method, i.e., parsing the content and returning the DocData can run unprotected.
    synchronized (lock) {
      if (reader == null) {
        openNextFile();
      }

      // 1. skip until doc start - required for all TREC formats
      docBuf.setLength(0);
      read(docBuf, DOC, false, false);

      // save parsedFile for passing trecDataParser after the sync block, in
      // case another thread will open another file in between.
      parsedPathType = currPathType;

      // 2. name - required for all TREC formats
      docBuf.setLength(0);
      read(docBuf, DOCNO, true, false);
      // docBuf now holds exactly the matching line, which starts with DOCNO.
      int nameStart = DOCNO.length();
      int nameEnd = docBuf.indexOf(TERMINATING_DOCNO, nameStart);
      if (nameEnd < 0) {
        // No closing tag on this line: use the rest of the line as the name
        // instead of failing with StringIndexOutOfBoundsException.
        nameEnd = docBuf.length();
      }
      name = docBuf.substring(nameStart, nameEnd).trim();

      if (!excludeDocnameIteration) {
        name = name + "_" + iteration;
      }

      // 3. read all until end of doc
      docBuf.setLength(0);
      read(docBuf, TERMINATING_DOC, false, true);
    }

    // count char length of text to be parsed (may be larger than the resulted plain doc body text).
    addBytes(docBuf.length());

    // This code segment relies on HtmlParser being thread safe. When we get
    // here, everything else is already private to that thread, so we're safe.
    try {
      docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
      addDoc();
    } catch (InterruptedException ie) {
      throw new ThreadInterruptedException(ie);
    }

    return docData;
  }

  /** Closes the current reader and restarts from the first file at iteration 0. */
  @Override
  public void resetInputs() throws IOException {
    synchronized (lock) {
      super.resetInputs();
      close();
      nextFile = 0;
      iteration = 0;
    }
  }

  /**
   * Resolves the data directory, collects the input files and instantiates
   * the document and HTML parsers named in the configuration.
   *
   * @throws IllegalArgumentException if no input files are found.
   * @throws RuntimeException if a configured parser class cannot be instantiated.
   */
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);
    // dirs
    File workDir = new File(config.get("work.dir", "work"));
    String d = config.get("docs.dir", "trec");
    dataDir = new File(d);
    if (!dataDir.isAbsolute()) {
      dataDir = new File(workDir, d);
    }
    // files - start from an empty list so a repeated call does not duplicate entries
    inputFiles.clear();
    collectFiles(dataDir, inputFiles);
    if (inputFiles.isEmpty()) {
      throw new IllegalArgumentException("No files in dataDir: " + dataDir);
    }
    // trec doc parser
    try {
      String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
      trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class).newInstance();
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }
    // html parser
    try {
      String htmlParserClassName = config.get("html.parser",
          "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
      htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class).newInstance();
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }
    // encoding
    if (encoding == null) {
      encoding = "ISO-8859-1";
    }
    // iteration exclusion in doc name
    excludeDocnameIteration = config.get("content.source.excludeIteration", false);
  }

}