--- /dev/null
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
+import org.apache.lucene.util.ThreadInterruptedException;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+/**
+ * A {@link ContentSource} which reads the English Wikipedia dump. You can read
+ * the .bz2 file directly (it will be decompressed on the fly). Config
+ * properties:
+ * <ul>
+ * <li>keep.image.only.docs=false|true (default <b>true</b>) - whether pages
+ * whose title starts with "Image:" are kept.</li>
+ * <li>docs.file=&lt;path to the file&gt; (required).</li>
+ * </ul>
+ */
+public class EnwikiContentSource extends ContentSource {
+
+  /**
+   * SAX handler that parses the input on a separate daemon thread and hands
+   * the parsed pages, one at a time, to the thread calling {@code next()}.
+   * The hand-off goes through the single-slot {@code tuple} field, guarded by
+   * this object's monitor: the parsing thread waits while the slot is full,
+   * the consumer waits while it is empty.
+   */
+  @SuppressWarnings("synthetic-access")
+  private class Parser extends DefaultHandler implements Runnable {
+    /** Parsing thread; null until the first call to next() and again after end of data was reported. */
+    private Thread t;
+    /** Set (under this monitor) when the parsing thread leaves run(), for whatever reason. */
+    private boolean threadDone;
+    /** Single-slot hand-off buffer: the most recently parsed page, or null when empty. */
+    private String[] tuple;
+    /** End-of-data signal set by the parsing thread; cleared by next() once it has been thrown. */
+    private NoMoreDataException nmde;
+    /** Character data collected for the element currently being read. */
+    private StringBuilder contents = new StringBuilder();
+    /** Values collected for the page currently being parsed; reset at each page start. */
+    private String title;
+    private String body;
+    private String time;
+    private String id;
+
+    public Parser() {
+    }
+
+    /**
+     * Returns the next parsed page, starting the parsing thread if it is not
+     * running. Blocks until a page is available, end of data is signalled, or
+     * the parsing thread has exited.
+     *
+     * @return an array of size LENGTH, indexed by TITLE, DATE, BODY and ID
+     * @throws NoMoreDataException if the parsing thread signalled end of data,
+     *         or exited without producing another page
+     * @throws ThreadInterruptedException if the calling thread is interrupted
+     *         while waiting
+     */
+    String[] next() throws NoMoreDataException {
+      if (t == null) {
+        threadDone = false;
+        t = new Thread(this);
+        t.setDaemon(true);
+        t.start();
+      }
+      synchronized(this){
+        while(tuple == null && nmde == null && !threadDone) {
+          try {
+            wait();
+          } catch (InterruptedException ie) {
+            throw new ThreadInterruptedException(ie);
+          }
+        }
+        if (tuple != null) {
+          // Deliver a pending page before looking at nmde/threadDone: the
+          // parsing thread sets those without waiting for the slot to be
+          // emptied, so the last page may still be sitting here.
+          String[] result = tuple;
+          tuple = null;
+          notify();
+          return result;
+        }
+        if (nmde != null) {
+          // Report end of data exactly once. Clearing both the signal and the
+          // thread reference lets the next call start a fresh parsing thread
+          // in case we are re-used; a stale nmde would otherwise be re-thrown
+          // immediately on every later call.
+          NoMoreDataException endOfData = nmde;
+          nmde = null;
+          t = null;
+          throw endOfData;
+        }
+        // No page and no end-of-data signal, so the loop above can only have
+        // ended because the thread exited, which means it hit an exception.
+        // We throw NoMoreDataException here to force benchmark to stop the
+        // current alg:
+        throw new NoMoreDataException();
+      }
+    }
+
+    /**
+     * Re-formats a timestamp by fixed character positions: chars 8-9, the
+     * month name looked up from chars 5-6, chars 0-3, a space, chars 11-18
+     * and a constant ".000" suffix.
+     * NOTE(review): this presumably expects the dump's yyyy-MM-ddTHH:mm:ssZ
+     * layout and yields dd-MMM-yyyy HH:mm:ss.000 - confirm against the dump.
+     *
+     * @param original the timestamp text, at least 19 characters long
+     * @return the re-formatted timestamp
+     */
+    String time(String original) {
+      StringBuilder buffer = new StringBuilder();
+
+      buffer.append(original.substring(8, 10));
+      buffer.append('-');
+      buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
+      buffer.append('-');
+      buffer.append(original.substring(0, 4));
+      buffer.append(' ');
+      buffer.append(original.substring(11, 19));
+      buffer.append(".000");
+
+      return buffer.toString();
+    }
+
+    /** Appends the reported characters to the buffer of the current element. */
+    @Override
+    public void characters(char[] ch, int start, int length) {
+      contents.append(ch, start, length);
+    }
+
+    /**
+     * Stores the collected text of title, timestamp, text and id elements and,
+     * at the end of a page, publishes the page to the consumer (blocking
+     * until the previous page has been taken).
+     */
+    @Override
+    public void endElement(String namespace, String simple, String qualified)
+        throws SAXException {
+      int elemType = getElementType(qualified);
+      switch (elemType) {
+        case PAGE:
+          // the body must not be null (redirects are nulled out below) and we
+          // either are keeping image docs or the title does not start with
+          // Image:
+          if (body != null && (keepImages || !title.startsWith("Image:"))) {
+            String[] tmpTuple = new String[LENGTH];
+            tmpTuple[TITLE] = title.replace('\t', ' ');
+            tmpTuple[DATE] = time.replace('\t', ' ');
+            // Tabs and newlines become single spaces; plain char replacement
+            // avoids compiling a regular expression for every page.
+            tmpTuple[BODY] = body.replace('\t', ' ').replace('\n', ' ');
+            tmpTuple[ID] = id;
+            synchronized(this) {
+              // Wait until the consumer has emptied the slot.
+              while (tuple != null) {
+                try {
+                  wait();
+                } catch (InterruptedException ie) {
+                  throw new ThreadInterruptedException(ie);
+                }
+              }
+              tuple = tmpTuple;
+              notify();
+            }
+          }
+          break;
+        case BODY:
+          body = contents.toString();
+          // Drop redirect pages. regionMatches(ignoreCase=true) acts as a
+          // case-insensitive startsWith that does not depend on the default
+          // locale (toLowerCase() would turn "I" into a dotless i under a
+          // Turkish locale and miss "#REDIRECT").
+          if (body.regionMatches(true, 0, REDIRECT_PREFIX, 0, REDIRECT_PREFIX.length())) {
+            body = null;
+          }
+          break;
+        case DATE:
+          time = time(contents.toString());
+          break;
+        case TITLE:
+          title = contents.toString();
+          break;
+        case ID:
+          //the doc id is the first one in the page. All other ids after that one can be ignored according to the schema
+          if (id == null) {
+            id = contents.toString();
+          }
+          break;
+        default:
+          // this element should be discarded.
+      }
+    }
+
+    /**
+     * Body of the parsing thread: parses the current input stream and, unless
+     * the source runs forever, signals end of data afterwards. When running
+     * forever the input is re-opened and parsed again. SAX and I/O failures
+     * are re-thrown as RuntimeException; threadDone is always set on exit.
+     */
+    public void run() {
+
+      try {
+        XMLReader reader = XMLReaderFactory.createXMLReader();
+        reader.setContentHandler(this);
+        reader.setErrorHandler(this);
+        while(true){
+          // Remember which stream this pass reads so that a replacement of
+          // the shared stream can be detected below.
+          final InputStream localFileIS = is;
+          try {
+            reader.parse(new InputSource(localFileIS));
+          } catch (IOException ioe) {
+            synchronized(EnwikiContentSource.this) {
+              if (localFileIS != is) {
+                // fileIS was closed on us, so, just fall
+                // through
+              } else {
+                // Exception is real
+                throw ioe;
+              }
+            }
+          }
+          synchronized(this) {
+            if (!forever) {
+              nmde = new NoMoreDataException();
+              notify();
+              return;
+            } else if (localFileIS == is) {
+              // If file is not already re-opened then re-open it now
+              is = StreamUtils.inputStream(file);
+            }
+          }
+        }
+      } catch (SAXException sae) {
+        throw new RuntimeException(sae);
+      } catch (IOException ioe) {
+        throw new RuntimeException(ioe);
+      } finally {
+        synchronized(this) {
+          // Wake a consumer that may be waiting in next().
+          threadDone = true;
+          notify();
+        }
+      }
+    }
+
+    /**
+     * Resets the per-page state at the start of a page and clears the
+     * character buffer at the start of every element whose text is collected.
+     */
+    @Override
+    public void startElement(String namespace, String simple, String qualified,
+                             Attributes attributes) {
+      int elemType = getElementType(qualified);
+      switch (elemType) {
+        case PAGE:
+          title = null;
+          body = null;
+          time = null;
+          id = null;
+          break;
+        case BODY:
+          // intentional fall-through: all collected elements are handled alike.
+        case DATE:
+        case TITLE:
+        case ID:
+          contents.setLength(0);
+          break;
+        default:
+          // this element should be discarded.
+      }
+    }
+  }
+
+  /** Maps the element names of interest to their element type constant. */
+  private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>();
+  private static final int TITLE = 0;
+  private static final int DATE = TITLE + 1;
+  private static final int BODY = DATE + 1;
+  private static final int ID = BODY + 1;
+  private static final int LENGTH = ID + 1;
+  // LENGTH is used as the size of the tuple, so whatever constants we need that
+  // should not be part of the tuple, we should define them after LENGTH.
+  private static final int PAGE = LENGTH + 1;
+
+  /** Month abbreviations, indexed by month number minus one. */
+  private static final String[] months = {"JAN", "FEB", "MAR", "APR",
+                                          "MAY", "JUN", "JUL", "AUG",
+                                          "SEP", "OCT", "NOV", "DEC"};
+
+  /** Prefix (matched ignoring case) of page texts that are skipped as redirects. */
+  private static final String REDIRECT_PREFIX = "#redirect";
+
+  static {
+    ELEMENTS.put("page", Integer.valueOf(PAGE));
+    ELEMENTS.put("text", Integer.valueOf(BODY));
+    ELEMENTS.put("timestamp", Integer.valueOf(DATE));
+    ELEMENTS.put("title", Integer.valueOf(TITLE));
+    ELEMENTS.put("id", Integer.valueOf(ID));
+  }
+
+  /**
+   * Returns the type of the element if defined, otherwise returns -1. This
+   * method is useful in startElement and endElement, by not needing to compare
+   * the element qualified name over and over.
+   */
+  private static final int getElementType(String elem) {
+    Integer val = ELEMENTS.get(elem);
+    return val == null ? -1 : val.intValue();
+  }
+
+  /** The input file, taken from the docs.file property. */
+  private File file;
+  /** Value of the keep.image.only.docs property. */
+  private boolean keepImages = true;
+  /** The stream currently being parsed; replaced on reset and when re-opened by the parser. */
+  private InputStream is;
+  private Parser parser = new Parser();
+
+  /** Closes the current input stream, if there is one, and clears the reference to it. */
+  @Override
+  public void close() throws IOException {
+    synchronized (EnwikiContentSource.this) {
+      if (is != null) {
+        is.close();
+        is = null;
+      }
+    }
+  }
+
+  /**
+   * Fetches the next page from the parser and copies it into the given
+   * DocData: the page id becomes the name, followed by body, date and title.
+   *
+   * @param docData the instance to clear and fill
+   * @return the same docData instance
+   * @throws NoMoreDataException if the parser has no more pages
+   */
+  @Override
+  public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
+    String[] tuple = parser.next();
+    docData.clear();
+    docData.setName(tuple[ID]);
+    docData.setBody(tuple[BODY]);
+    docData.setDate(tuple[DATE]);
+    docData.setTitle(tuple[TITLE]);
+    return docData;
+  }
+
+  /** Resets the superclass state and opens a new input stream on the configured file. */
+  @Override
+  public void resetInputs() throws IOException {
+    super.resetInputs();
+    is = StreamUtils.inputStream(file);
+  }
+
+  /**
+   * Reads the keep.image.only.docs (default true) and docs.file properties.
+   *
+   * @throws IllegalArgumentException if docs.file is not set
+   */
+  @Override
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    keepImages = config.get("keep.image.only.docs", true);
+    String fileName = config.get("docs.file", null);
+    if (fileName == null) {
+      throw new IllegalArgumentException("docs.file must be set");
+    }
+    file = new File(fileName).getAbsoluteFile();
+  }
+
+}