X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?ds=sidebyside diff --git a/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java deleted file mode 100644 index 5153ad0..0000000 --- a/lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ /dev/null @@ -1,308 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.utils.StreamUtils; -import org.apache.lucene.util.ThreadInterruptedException; -import org.xml.sax.Attributes; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; -import org.xml.sax.helpers.XMLReaderFactory; - -/** - * A {@link ContentSource} which reads the English Wikipedia dump. You can read - * the .bz2 file directly (it will be decompressed on the fly). Config - * properties: - * - */ -public class EnwikiContentSource extends ContentSource { - - private class Parser extends DefaultHandler implements Runnable { - private Thread t; - private boolean threadDone; - private String[] tuple; - private NoMoreDataException nmde; - private StringBuilder contents = new StringBuilder(); - private String title; - private String body; - private String time; - private String id; - - String[] next() throws NoMoreDataException { - if (t == null) { - threadDone = false; - t = new Thread(this); - t.setDaemon(true); - t.start(); - } - String[] result; - synchronized(this){ - while(tuple == null && nmde == null && !threadDone) { - try { - wait(); - } catch (InterruptedException ie) { - throw new ThreadInterruptedException(ie); - } - } - if (nmde != null) { - // Set to null so we will re-start thread in case - // we are re-used: - t = null; - throw nmde; - } - if (t != null && threadDone) { - // The thread has exited yet did not hit end of - // data, so this means it hit an exception. We - // throw NoMorDataException here to force - // benchmark to stop the current alg: - throw new NoMoreDataException(); - } - result = tuple; - tuple = null; - notify(); - } - return result; - } - - String time(String original) { - StringBuilder buffer = new StringBuilder(); - - buffer.append(original.substring(8, 10)); - buffer.append('-'); - buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); - buffer.append('-'); - buffer.append(original.substring(0, 4)); - buffer.append(' '); - buffer.append(original.substring(11, 19)); - buffer.append(".000"); - - return buffer.toString(); - } - - @Override - public void characters(char[] ch, int start, int length) { - contents.append(ch, start, length); - } - - @Override - public void endElement(String namespace, String simple, String qualified) - throws SAXException { - int elemType = getElementType(qualified); - switch (elemType) { - case PAGE: - // the body must be null and we either are keeping image docs or the - // title does not start with Image: - if (body != null && (keepImages || !title.startsWith("Image:"))) { - String[] tmpTuple = new String[LENGTH]; - tmpTuple[TITLE] = title.replace('\t', ' '); - tmpTuple[DATE] = time.replace('\t', ' '); - tmpTuple[BODY] = body.replaceAll("[\t\n]", " "); - tmpTuple[ID] = id; - synchronized(this) { - while (tuple != null) { - try { - wait(); - } catch (InterruptedException ie) { - throw new ThreadInterruptedException(ie); - } - } - tuple = tmpTuple; - notify(); - } - } - break; - case BODY: - body = contents.toString(); - //workaround that startswith doesn't have an ignore case option, get at least 20 chars. - String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); - if (startsWith.startsWith("#redirect")) { - body = null; - } - break; - case DATE: - time = time(contents.toString()); - break; - case TITLE: - title = contents.toString(); - break; - case ID: - //the doc id is the first one in the page. All other ids after that one can be ignored according to the schema - if (id == null) { - id = contents.toString(); - } - break; - default: - // this element should be discarded. - } - } - - public void run() { - - try { - XMLReader reader = XMLReaderFactory.createXMLReader(); - reader.setContentHandler(this); - reader.setErrorHandler(this); - while(true){ - final InputStream localFileIS = is; - try { - reader.parse(new InputSource(localFileIS)); - } catch (IOException ioe) { - synchronized(EnwikiContentSource.this) { - if (localFileIS != is) { - // fileIS was closed on us, so, just fall - // through - } else - // Exception is real - throw ioe; - } - } - synchronized(this) { - if (!forever) { - nmde = new NoMoreDataException(); - notify(); - return; - } else if (localFileIS == is) { - // If file is not already re-opened then re-open it now - is = StreamUtils.inputStream(file); - } - } - } - } catch (SAXException sae) { - throw new RuntimeException(sae); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } finally { - synchronized(this) { - threadDone = true; - notify(); - } - } - } - - @Override - public void startElement(String namespace, String simple, String qualified, - Attributes attributes) { - int elemType = getElementType(qualified); - switch (elemType) { - case PAGE: - title = null; - body = null; - time = null; - id = null; - break; - // intentional fall-through. - case BODY: - case DATE: - case TITLE: - case ID: - contents.setLength(0); - break; - default: - // this element should be discarded. - } - } - } - - private static final Map ELEMENTS = new HashMap(); - private static final int TITLE = 0; - private static final int DATE = TITLE + 1; - private static final int BODY = DATE + 1; - private static final int ID = BODY + 1; - private static final int LENGTH = ID + 1; - // LENGTH is used as the size of the tuple, so whatever constants we need that - // should not be part of the tuple, we should define them after LENGTH. - private static final int PAGE = LENGTH + 1; - - private static final String[] months = {"JAN", "FEB", "MAR", "APR", - "MAY", "JUN", "JUL", "AUG", - "SEP", "OCT", "NOV", "DEC"}; - - static { - ELEMENTS.put("page", Integer.valueOf(PAGE)); - ELEMENTS.put("text", Integer.valueOf(BODY)); - ELEMENTS.put("timestamp", Integer.valueOf(DATE)); - ELEMENTS.put("title", Integer.valueOf(TITLE)); - ELEMENTS.put("id", Integer.valueOf(ID)); - } - - /** - * Returns the type of the element if defined, otherwise returns -1. This - * method is useful in startElement and endElement, by not needing to compare - * the element qualified name over and over. - */ - private final static int getElementType(String elem) { - Integer val = ELEMENTS.get(elem); - return val == null ? -1 : val.intValue(); - } - - private File file; - private boolean keepImages = true; - private InputStream is; - private Parser parser = new Parser(); - - @Override - public void close() throws IOException { - synchronized (EnwikiContentSource.this) { - if (is != null) { - is.close(); - is = null; - } - } - } - - @Override - public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { - String[] tuple = parser.next(); - docData.clear(); - docData.setName(tuple[ID]); - docData.setBody(tuple[BODY]); - docData.setDate(tuple[DATE]); - docData.setTitle(tuple[TITLE]); - return docData; - } - - @Override - public void resetInputs() throws IOException { - super.resetInputs(); - is = StreamUtils.inputStream(file); - } - - @Override - public void setConfig(Config config) { - super.setConfig(config); - keepImages = config.get("keep.image.only.docs", true); - String fileName = config.get("docs.file", null); - if (fileName == null) { - throw new IllegalArgumentException("docs.file must be set"); - } - file = new File(fileName).getAbsoluteFile(); - } - -}