1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.HashMap;
26 import org.apache.lucene.benchmark.byTask.utils.Config;
27 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
28 import org.apache.lucene.util.ThreadInterruptedException;
29 import org.xml.sax.Attributes;
30 import org.xml.sax.InputSource;
31 import org.xml.sax.SAXException;
32 import org.xml.sax.XMLReader;
33 import org.xml.sax.helpers.DefaultHandler;
34 import org.xml.sax.helpers.XMLReaderFactory;
37 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
38 * the .bz2 file directly (it will be decompressed on the fly). Config
41 * <li>keep.image.only.docs=false|true (default <b>true</b>).
42 * <li>docs.file=&lt;path to the file&gt;
45 public class EnwikiContentSource extends ContentSource {
47 private class Parser extends DefaultHandler implements Runnable {
49 private boolean threadDone;
50 private String[] tuple;
51 private NoMoreDataException nmde;
52 private StringBuilder contents = new StringBuilder();
58 String[] next() throws NoMoreDataException {
67 while(tuple == null && nmde == null && !threadDone) {
70 } catch (InterruptedException ie) {
71 throw new ThreadInterruptedException(ie);
75 // Set to null so we will re-start thread in case
80 if (t != null && threadDone) {
81 // The thread has exited yet did not hit end of
82 // data, so this means it hit an exception. We
83 // throw NoMoreDataException here to force
84 // benchmark to stop the current alg:
85 throw new NoMoreDataException();
94 String time(String original) {
95 StringBuilder buffer = new StringBuilder();
97 buffer.append(original.substring(8, 10));
99 buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
101 buffer.append(original.substring(0, 4));
103 buffer.append(original.substring(11, 19));
104 buffer.append(".000");
106 return buffer.toString();
110 public void characters(char[] ch, int start, int length) {
111 contents.append(ch, start, length);
115 public void endElement(String namespace, String simple, String qualified)
116 throws SAXException {
117 int elemType = getElementType(qualified);
120 // the body must not be null and we either are keeping image docs or the
121 // title does not start with Image:
122 if (body != null && (keepImages || !title.startsWith("Image:"))) {
123 String[] tmpTuple = new String[LENGTH];
124 tmpTuple[TITLE] = title.replace('\t', ' ');
125 tmpTuple[DATE] = time.replace('\t', ' ');
126 tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
129 while (tuple != null) {
132 } catch (InterruptedException ie) {
133 throw new ThreadInterruptedException(ie);
142 body = contents.toString();
143 // workaround for startsWith having no ignore-case option: lower-case at most the first 10 chars.
144 String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
145 if (startsWith.startsWith("#redirect")) {
150 time = time(contents.toString());
153 title = contents.toString();
156 //the doc id is the first one in the page. All other ids after that one can be ignored according to the schema
158 id = contents.toString();
162 // this element should be discarded.
169 XMLReader reader = XMLReaderFactory.createXMLReader();
170 reader.setContentHandler(this);
171 reader.setErrorHandler(this);
173 final InputStream localFileIS = is;
175 reader.parse(new InputSource(localFileIS));
176 } catch (IOException ioe) {
177 synchronized(EnwikiContentSource.this) {
178 if (localFileIS != is) {
179 // fileIS was closed on us, so, just fall
188 nmde = new NoMoreDataException();
191 } else if (localFileIS == is) {
192 // If file is not already re-opened then re-open it now
193 is = StreamUtils.inputStream(file);
197 } catch (SAXException sae) {
198 throw new RuntimeException(sae);
199 } catch (IOException ioe) {
200 throw new RuntimeException(ioe);
210 public void startElement(String namespace, String simple, String qualified,
211 Attributes attributes) {
212 int elemType = getElementType(qualified);
220 // intentional fall-through.
225 contents.setLength(0);
228 // this element should be discarded.
233 private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>(); // XML element name -> integer type code, looked up by getElementType
234 private static final int TITLE = 0; // tuple index of the document title
235 private static final int DATE = TITLE + 1; // tuple index of the document date
236 private static final int BODY = DATE + 1; // tuple index of the document body text
237 private static final int ID = BODY + 1; // tuple index of the document id
238 private static final int LENGTH = ID + 1; // number of slots in a tuple (used as the String[] size)
239 // LENGTH is used as the size of the tuple, so whatever constants we need that
240 // should not be part of the tuple, we should define them after LENGTH.
241 private static final int PAGE = LENGTH + 1; // type code registered for the "page" element; not a tuple index
243 private static final String[] months = {"JAN", "FEB", "MAR", "APR", // month abbreviations, indexed by (month number - 1) in time()
244 "MAY", "JUN", "JUL", "AUG",
245 "SEP", "OCT", "NOV", "DEC"};
248 ELEMENTS.put("page", Integer.valueOf(PAGE));
249 ELEMENTS.put("text", Integer.valueOf(BODY));
250 ELEMENTS.put("timestamp", Integer.valueOf(DATE));
251 ELEMENTS.put("title", Integer.valueOf(TITLE));
252 ELEMENTS.put("id", Integer.valueOf(ID));
256 * Returns the type of the element if defined, otherwise returns -1. This
257 * method is useful in startElement and endElement, by not needing to compare
258 * the element qualified name over and over.
260 private final static int getElementType(String elem) {
261 Integer val = ELEMENTS.get(elem);
262 return val == null ? -1 : val.intValue();
266 private boolean keepImages = true; // when false, pages whose title starts with "Image:" are dropped; set from config key "keep.image.only.docs"
267 private InputStream is; // stream over the docs file, (re)opened via StreamUtils.inputStream(file)
268 private Parser parser = new Parser(); // SAX handler whose next() supplies the tuples read by getNextDocData
271 public void close() throws IOException {
272 synchronized (EnwikiContentSource.this) {
281 public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
282 String[] tuple = parser.next();
284 docData.setName(tuple[ID]);
285 docData.setBody(tuple[BODY]);
286 docData.setDate(tuple[DATE]);
287 docData.setTitle(tuple[TITLE]);
292 public void resetInputs() throws IOException {
294 is = StreamUtils.inputStream(file);
298 public void setConfig(Config config) {
299 super.setConfig(config);
300 keepImages = config.get("keep.image.only.docs", true);
301 String fileName = config.get("docs.file", null);
302 if (fileName == null) {
303 throw new IllegalArgumentException("docs.file must be set");
305 file = new File(fileName).getAbsoluteFile();