1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.HashMap;
26 import org.apache.lucene.benchmark.byTask.utils.Config;
27 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
28 import org.apache.lucene.util.ThreadInterruptedException;
29 import org.xml.sax.Attributes;
30 import org.xml.sax.InputSource;
31 import org.xml.sax.SAXException;
32 import org.xml.sax.XMLReader;
33 import org.xml.sax.helpers.DefaultHandler;
34 import org.xml.sax.helpers.XMLReaderFactory;
37 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
38 * the .bz2 file directly (it will be decompressed on the fly). Config
41 * <li>keep.image.only.docs=false|true (default <b>true</b>).
42 * <li>docs.file=&lt;path to the file&gt;
45 public class EnwikiContentSource extends ContentSource {
47 @SuppressWarnings("synthetic-access")
48 private class Parser extends DefaultHandler implements Runnable {
50 private boolean threadDone;
51 private String[] tuple;
52 private NoMoreDataException nmde;
53 private StringBuilder contents = new StringBuilder();
62 String[] next() throws NoMoreDataException {
71 while(tuple == null && nmde == null && !threadDone) {
74 } catch (InterruptedException ie) {
75 throw new ThreadInterruptedException(ie);
79 // Set to null so we will re-start thread in case
84 if (t != null && threadDone) {
85 // The thread has exited yet did not hit end of
86 // data, so this means it hit an exception. We
87 // throw NoMoreDataException here to force
88 // benchmark to stop the current alg:
89 throw new NoMoreDataException();
98 String time(String original) {
99 StringBuilder buffer = new StringBuilder();
101 buffer.append(original.substring(8, 10));
103 buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
105 buffer.append(original.substring(0, 4));
107 buffer.append(original.substring(11, 19));
108 buffer.append(".000");
110 return buffer.toString();
114 public void characters(char[] ch, int start, int length) {
115 contents.append(ch, start, length);
119 public void endElement(String namespace, String simple, String qualified)
120 throws SAXException {
121 int elemType = getElementType(qualified);
124 // the body must not be null and we are either keeping image docs or the
125 // title does not start with Image:
126 if (body != null && (keepImages || !title.startsWith("Image:"))) {
127 String[] tmpTuple = new String[LENGTH];
128 tmpTuple[TITLE] = title.replace('\t', ' ');
129 tmpTuple[DATE] = time.replace('\t', ' ');
130 tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
133 while (tuple != null) {
136 } catch (InterruptedException ie) {
137 throw new ThreadInterruptedException(ie);
146 body = contents.toString();
147 // workaround: startsWith has no ignore-case option, so lower-case a prefix of at most 10 chars.
148 String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
149 if (startsWith.startsWith("#redirect")) {
154 time = time(contents.toString());
157 title = contents.toString();
160 //the doc id is the first one in the page. All other ids after that one can be ignored according to the schema
162 id = contents.toString();
166 // this element should be discarded.
173 XMLReader reader = XMLReaderFactory.createXMLReader();
174 reader.setContentHandler(this);
175 reader.setErrorHandler(this);
177 final InputStream localFileIS = is;
179 reader.parse(new InputSource(localFileIS));
180 } catch (IOException ioe) {
181 synchronized(EnwikiContentSource.this) {
182 if (localFileIS != is) {
183 // fileIS was closed on us, so, just fall
192 nmde = new NoMoreDataException();
195 } else if (localFileIS == is) {
196 // If file is not already re-opened then re-open it now
197 is = StreamUtils.inputStream(file);
201 } catch (SAXException sae) {
202 throw new RuntimeException(sae);
203 } catch (IOException ioe) {
204 throw new RuntimeException(ioe);
214 public void startElement(String namespace, String simple, String qualified,
215 Attributes attributes) {
216 int elemType = getElementType(qualified);
224 // intentional fall-through.
229 contents.setLength(0);
232 // this element should be discarded.
237 private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>();
238 private static final int TITLE = 0;
239 private static final int DATE = TITLE + 1;
240 private static final int BODY = DATE + 1;
241 private static final int ID = BODY + 1;
242 private static final int LENGTH = ID + 1;
243 // LENGTH is used as the size of the tuple, so whatever constants we need that
244 // should not be part of the tuple, we should define them after LENGTH.
245 private static final int PAGE = LENGTH + 1;
247 private static final String[] months = {"JAN", "FEB", "MAR", "APR",
248 "MAY", "JUN", "JUL", "AUG",
249 "SEP", "OCT", "NOV", "DEC"};
252 ELEMENTS.put("page", Integer.valueOf(PAGE));
253 ELEMENTS.put("text", Integer.valueOf(BODY));
254 ELEMENTS.put("timestamp", Integer.valueOf(DATE));
255 ELEMENTS.put("title", Integer.valueOf(TITLE));
256 ELEMENTS.put("id", Integer.valueOf(ID));
260 * Returns the type of the element if defined, otherwise returns -1. This
261 * method is useful in startElement and endElement, by not needing to compare
262 * the element qualified name over and over.
264 private final static int getElementType(String elem) {
265 Integer val = ELEMENTS.get(elem);
266 return val == null ? -1 : val.intValue();
270 private boolean keepImages = true;
271 private InputStream is;
272 private Parser parser = new Parser();
275 public void close() throws IOException {
276 synchronized (EnwikiContentSource.this) {
285 public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
286 String[] tuple = parser.next();
288 docData.setName(tuple[ID]);
289 docData.setBody(tuple[BODY]);
290 docData.setDate(tuple[DATE]);
291 docData.setTitle(tuple[TITLE]);
296 public void resetInputs() throws IOException {
298 is = StreamUtils.inputStream(file);
302 public void setConfig(Config config) {
303 super.setConfig(config);
304 keepImages = config.get("keep.image.only.docs", true);
305 String fileName = config.get("docs.file", null);
306 if (fileName == null) {
307 throw new IllegalArgumentException("docs.file must be set");
309 file = new File(fileName).getAbsoluteFile();