X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java diff --git a/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java new file mode 100644 index 0000000..bb459c8 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -0,0 +1,312 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
 * the .bz2 file directly (it will be decompressed on the fly). Config
 * properties:
 * <ul>
 * <li>keep.image.only.docs=false|true (default <b>true</b>).
 * <li>docs.file=&lt;path to the file&gt; (required).
 * </ul>
 */
public class EnwikiContentSource extends ContentSource {

  /**
   * SAX handler that parses the dump on its own daemon thread and hands one
   * finished page at a time to the consumer.
   * <p>
   * The hand-off uses the single-slot {@link #tuple} field guarded by this
   * object's monitor: the parser thread blocks in {@link #endElement} while
   * the slot is full, and the consumer blocks in {@link #next()} while it is
   * empty.
   */
  @SuppressWarnings("synthetic-access")
  private class Parser extends DefaultHandler implements Runnable {
    /** Parser thread; null until first use and after end of data was reported. */
    private Thread t;
    /** Set when {@link #run()} exits for any reason. */
    private boolean threadDone;
    /** Hand-off slot holding the most recently completed page, or null. */
    private String[] tuple;
    /** Set by the parser thread once the input is exhausted (non-forever mode). */
    private NoMoreDataException nmde;
    /** Character data accumulated for the element currently being read. */
    private final StringBuilder contents = new StringBuilder();
    private String title;
    private String body;
    private String time;
    private String id;

    public Parser() {
    }

    /**
     * Returns the next page as a tuple indexed by {@link #TITLE},
     * {@link #DATE}, {@link #BODY} and {@link #ID}, starting the parser
     * thread on first use.
     *
     * @return the next page tuple, never null
     * @throws NoMoreDataException if the input is exhausted, or if the parser
     *         thread exited without producing another page
     */
    String[] next() throws NoMoreDataException {
      if (t == null) {
        threadDone = false;
        t = new Thread(this);
        t.setDaemon(true);
        t.start();
      }
      synchronized (this) {
        while (tuple == null && nmde == null && !threadDone) {
          try {
            wait();
          } catch (InterruptedException ie) {
            throw new ThreadInterruptedException(ie);
          }
        }
        // Hand out a pending page first: the parser thread may have published
        // the last page and then reached end of data (or died) before we woke
        // up, and that page must not be dropped.
        if (tuple != null) {
          String[] result = tuple;
          tuple = null;
          // wake the parser thread, which may be waiting for the slot to empty
          notify();
          return result;
        }
        if (nmde != null) {
          // Set to null so we will re-start thread in case
          // we are re-used:
          t = null;
          throw nmde;
        }
        // The thread has exited yet did not hit end of
        // data, so this means it hit an exception. We
        // throw NoMoreDataException here to force
        // benchmark to stop the current alg:
        throw new NoMoreDataException();
      }
    }

    /**
     * Reformats a dump timestamp into <code>dd-MMM-yyyy HH:mm:ss.000</code>.
     * The day, month, year and time-of-day are read from fixed character
     * positions (8-10, 5-7, 0-4 and 11-19), so the input is expected to look
     * like <code>2007-03-14T12:34:56Z</code>.
     *
     * @param original the timestamp text as found in the dump
     * @return the reformatted timestamp
     */
    String time(String original) {
      StringBuilder buffer = new StringBuilder();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }

    /** Accumulates character data of the current element. */
    @Override
    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    /**
     * Captures the text of the elements of interest and, at the end of a
     * page, publishes the page tuple, blocking until the previous one was
     * consumed.
     */
    @Override
    public void endElement(String namespace, String simple, String qualified)
        throws SAXException {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          // the body must be non-null (null marks a redirect or a page
          // without text) and we either are keeping image docs or the
          // title does not start with Image:
          if (body != null && (keepImages || !title.startsWith("Image:"))) {
            String[] tmpTuple = new String[LENGTH];
            tmpTuple[TITLE] = title.replace('\t', ' ');
            tmpTuple[DATE] = time.replace('\t', ' ');
            // tabs and newlines are flattened to single spaces
            tmpTuple[BODY] = body.replace('\t', ' ').replace('\n', ' ');
            tmpTuple[ID] = id;
            synchronized (this) {
              // wait until the consumer has taken the previous page
              while (tuple != null) {
                try {
                  wait();
                } catch (InterruptedException ie) {
                  throw new ThreadInterruptedException(ie);
                }
              }
              tuple = tmpTuple;
              notify();
            }
          }
          break;
        case BODY:
          body = contents.toString();
          // workaround that startsWith doesn't have an ignore case option:
          // lower-case the first (up to) 10 chars, enough for "#redirect".
          // A fixed locale is used so that e.g. a Turkish default locale
          // (dotless i) does not break the comparison.
          String startsWith = body.substring(0, Math.min(10, contents.length()))
              .toLowerCase(java.util.Locale.ENGLISH);
          if (startsWith.startsWith("#redirect")) {
            body = null;
          }
          break;
        case DATE:
          time = time(contents.toString());
          break;
        case TITLE:
          title = contents.toString();
          break;
        case ID:
          // the doc id is the first one in the page. All other ids after
          // that one can be ignored according to the schema
          if (id == null) {
            id = contents.toString();
          }
          break;
        default:
          // this element should be discarded.
      }
    }

    /**
     * Parser thread body: parses the input stream, and either signals end of
     * data or, when <code>forever</code> is set, re-opens the file and parses
     * it again. Always sets {@link #threadDone} on exit.
     */
    public void run() {

      try {
        XMLReader reader = XMLReaderFactory.createXMLReader();
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        while (true) {
          final InputStream localFileIS = is;
          try {
            reader.parse(new InputSource(localFileIS));
          } catch (IOException ioe) {
            synchronized (EnwikiContentSource.this) {
              if (localFileIS != is) {
                // fileIS was closed on us, so, just fall
                // through
              } else {
                // Exception is real
                throw ioe;
              }
            }
          }
          synchronized (this) {
            // NOTE(review): `forever` is not declared in this file; presumably
            // inherited from ContentSource -- confirm.
            if (!forever) {
              nmde = new NoMoreDataException();
              notify();
              return;
            } else if (localFileIS == is) {
              // If file is not already re-opened then re-open it now
              is = StreamUtils.inputStream(file);
            }
          }
        }
      } catch (SAXException sae) {
        throw new RuntimeException(sae);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      } finally {
        synchronized (this) {
          threadDone = true;
          notify();
        }
      }
    }

    /**
     * Resets the per-page state at the start of a page, and clears the text
     * buffer at the start of every element whose text is captured.
     */
    @Override
    public void startElement(String namespace, String simple, String qualified,
                             Attributes attributes) {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          title = null;
          body = null;
          time = null;
          id = null;
          break;
        // the following cases intentionally share one body.
        case BODY:
        case DATE:
        case TITLE:
        case ID:
          contents.setLength(0);
          break;
        default:
          // this element should be discarded.
      }
    }
  }

  /** Maps element names of interest to the element type constants below. */
  private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>();
  private static final int TITLE = 0;
  private static final int DATE = TITLE + 1;
  private static final int BODY = DATE + 1;
  private static final int ID = BODY + 1;
  private static final int LENGTH = ID + 1;
  // LENGTH is used as the size of the tuple, so whatever constants we need that
  // should not be part of the tuple, we should define them after LENGTH.
  private static final int PAGE = LENGTH + 1;

  private static final String[] months = {"JAN", "FEB", "MAR", "APR",
                                          "MAY", "JUN", "JUL", "AUG",
                                          "SEP", "OCT", "NOV", "DEC"};

  static {
    ELEMENTS.put("page", Integer.valueOf(PAGE));
    ELEMENTS.put("text", Integer.valueOf(BODY));
    ELEMENTS.put("timestamp", Integer.valueOf(DATE));
    ELEMENTS.put("title", Integer.valueOf(TITLE));
    ELEMENTS.put("id", Integer.valueOf(ID));
  }

  /**
   * Returns the type of the element if defined, otherwise returns -1. This
   * method is useful in startElement and endElement, by not needing to compare
   * the element qualified name over and over.
   */
  private static int getElementType(String elem) {
    Integer val = ELEMENTS.get(elem);
    return val == null ? -1 : val.intValue();
  }

  /** The dump file, set from the <code>docs.file</code> property. */
  private File file;
  /** Whether pages whose title starts with "Image:" are kept. */
  private boolean keepImages = true;
  /** Stream currently being parsed; null after {@link #close()}. */
  private InputStream is;
  private final Parser parser = new Parser();

  /**
   * Closes the input stream, if open, and clears the reference so that the
   * parser thread can tell the stream was closed underneath it.
   */
  @Override
  public void close() throws IOException {
    synchronized (EnwikiContentSource.this) {
      if (is != null) {
        is.close();
        is = null;
      }
    }
  }

  /**
   * Fills <code>docData</code> with the next page of the dump: the page id
   * becomes the name, plus body, date and title.
   *
   * @param docData the instance to clear and populate
   * @return the same <code>docData</code> instance
   * @throws NoMoreDataException if there are no more pages to return
   */
  @Override
  public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String[] tuple = parser.next();
    docData.clear();
    docData.setName(tuple[ID]);
    docData.setBody(tuple[BODY]);
    docData.setDate(tuple[DATE]);
    docData.setTitle(tuple[TITLE]);
    return docData;
  }

  /** Re-opens the configured dump file as the current input stream. */
  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
    is = StreamUtils.inputStream(file);
  }

  /**
   * Reads <code>keep.image.only.docs</code> (default true) and the mandatory
   * <code>docs.file</code> property.
   *
   * @throws IllegalArgumentException if <code>docs.file</code> is not set
   */
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);
    keepImages = config.get("keep.image.only.docs", true);
    String fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = new File(fileName).getAbsoluteFile();
  }

}