lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java

   1 package org.apache.lucene.benchmark.byTask.feeds;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.io.InputStream;
  23 import java.util.HashMap;
  24 import java.util.Map;
  25
  26 import org.apache.lucene.benchmark.byTask.utils.Config;
  27 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
  28 import org.apache.lucene.util.ThreadInterruptedException;
  29 import org.xml.sax.Attributes;
  30 import org.xml.sax.InputSource;
  31 import org.xml.sax.SAXException;
  32 import org.xml.sax.XMLReader;
  33 import org.xml.sax.helpers.DefaultHandler;
  34 import org.xml.sax.helpers.XMLReaderFactory;
  35
  36 /**
  37  * A {@link ContentSource} which reads the English Wikipedia dump. You can read
  38  * the .bz2 file directly (it will be decompressed on the fly). Config
  39  * properties:
  40  * <ul>
  41  * <li>keep.image.only.docs=false|true (default <b>true</b>).
  42  * <li>docs.file=&lt;path to the file&gt;
  43  * </ul>
  44  */
  45 public class EnwikiContentSource extends ContentSource {
  46
  47   private class Parser extends DefaultHandler implements Runnable {
  48     private Thread t;
  49     private boolean threadDone;
  50     private String[] tuple;
  51     private NoMoreDataException nmde;
  52     private StringBuilder contents = new StringBuilder();
  53     private String title;
  54     private String body;
  55     private String time;
  56     private String id;
  57
  58     String[] next() throws NoMoreDataException {
  59       if (t == null) {
  60         threadDone = false;
  61         t = new Thread(this);
  62         t.setDaemon(true);
  63         t.start();
  64       }
  65       String[] result;
  66       synchronized(this){
  67         while(tuple == null && nmde == null && !threadDone) {
  68           try {
  69             wait();
  70           } catch (InterruptedException ie) {
  71             throw new ThreadInterruptedException(ie);
  72           }
  73         }
  74         if (nmde != null) {
  75           // Set to null so we will re-start thread in case
  76           // we are re-used:
  77           t = null;
  78           throw nmde;
  79         }
  80         if (t != null && threadDone) {
  81           // The thread has exited yet did not hit end of
  82           // data, so this means it hit an exception.  We
  83           // throw NoMorDataException here to force
  84           // benchmark to stop the current alg:
  85           throw new NoMoreDataException();
  86         }
  87         result = tuple;
  88         tuple = null;
  89         notify();
  90       }
  91       return result;
  92     }
  93
  94     String time(String original) {
  95       StringBuilder buffer = new StringBuilder();
  96
  97       buffer.append(original.substring(8, 10));
  98       buffer.append('-');
  99       buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
 100       buffer.append('-');
 101       buffer.append(original.substring(0, 4));
 102       buffer.append(' ');
 103       buffer.append(original.substring(11, 19));
 104       buffer.append(".000");
 105
 106       return buffer.toString();
 107     }
 108
 109     @Override
 110     public void characters(char[] ch, int start, int length) {
 111       contents.append(ch, start, length);
 112     }
 113
 114     @Override
 115     public void endElement(String namespace, String simple, String qualified)
 116       throws SAXException {
 117       int elemType = getElementType(qualified);
 118       switch (elemType) {
 119         case PAGE:
 120           // the body must be null and we either are keeping image docs or the
 121           // title does not start with Image:
 122           if (body != null && (keepImages || !title.startsWith("Image:"))) {
 123             String[] tmpTuple = new String[LENGTH];
 124             tmpTuple[TITLE] = title.replace('\t', ' ');
 125             tmpTuple[DATE] = time.replace('\t', ' ');
 126             tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
 127             tmpTuple[ID] = id;
 128             synchronized(this) {
 129               while (tuple != null) {
 130                 try {
 131                   wait();
 132                 } catch (InterruptedException ie) {
 133                   throw new ThreadInterruptedException(ie);
 134                 }
 135               }
 136               tuple = tmpTuple;
 137               notify();
 138             }
 139           }
 140           break;
 141         case BODY:
 142           body = contents.toString();
 143           //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
 144           String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
 145           if (startsWith.startsWith("#redirect")) {
 146             body = null;
 147           }
 148           break;
 149         case DATE:
 150           time = time(contents.toString());
 151           break;
 152         case TITLE:
 153           title = contents.toString();
 154           break;
 155         case ID:
 156           //the doc id is the first one in the page.  All other ids after that one can be ignored according to the schema
 157           if (id == null) {
 158             id = contents.toString();
 159           }
 160           break;
 161         default:
 162           // this element should be discarded.
 163       }
 164     }
 165
 166     public void run() {
 167
 168       try {
 169         XMLReader reader = XMLReaderFactory.createXMLReader();
 170         reader.setContentHandler(this);
 171         reader.setErrorHandler(this);
 172         while(true){
 173           final InputStream localFileIS = is;
 174           try {
 175             reader.parse(new InputSource(localFileIS));
 176           } catch (IOException ioe) {
 177             synchronized(EnwikiContentSource.this) {
 178               if (localFileIS != is) {
 179                 // fileIS was closed on us, so, just fall
 180                 // through
 181               } else
 182                 // Exception is real
 183                 throw ioe;
 184             }
 185           }
 186           synchronized(this) {
 187             if (!forever) {
 188               nmde = new NoMoreDataException();
 189               notify();
 190               return;
 191             } else if (localFileIS == is) {
 192               // If file is not already re-opened then re-open it now
 193               is = StreamUtils.inputStream(file);
 194             }
 195           }
 196         }
 197       } catch (SAXException sae) {
 198         throw new RuntimeException(sae);
 199       } catch (IOException ioe) {
 200         throw new RuntimeException(ioe);
 201       } finally {
 202         synchronized(this) {
 203           threadDone = true;
 204           notify();
 205         }
 206       }
 207     }
 208
 209     @Override
 210     public void startElement(String namespace, String simple, String qualified,
 211                              Attributes attributes) {
 212       int elemType = getElementType(qualified);
 213       switch (elemType) {
 214         case PAGE:
 215           title = null;
 216           body = null;
 217           time = null;
 218           id = null;
 219           break;
 220         // intentional fall-through.
 221         case BODY:
 222         case DATE:
 223         case TITLE:
 224         case ID:
 225           contents.setLength(0);
 226           break;
 227         default:
 228           // this element should be discarded.
 229       }
 230     }
 231   }
 232
 233   private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>();
 234   private static final int TITLE = 0;
 235   private static final int DATE = TITLE + 1;
 236   private static final int BODY = DATE + 1;
 237   private static final int ID = BODY + 1;
 238   private static final int LENGTH = ID + 1;
 239   // LENGTH is used as the size of the tuple, so whatever constants we need that
 240   // should not be part of the tuple, we should define them after LENGTH.
 241   private static final int PAGE = LENGTH + 1;
 242
 243   private static final String[] months = {"JAN", "FEB", "MAR", "APR",
 244                                   "MAY", "JUN", "JUL", "AUG",
 245                                   "SEP", "OCT", "NOV", "DEC"};
 246
 247   static {
 248     ELEMENTS.put("page", Integer.valueOf(PAGE));
 249     ELEMENTS.put("text", Integer.valueOf(BODY));
 250     ELEMENTS.put("timestamp", Integer.valueOf(DATE));
 251     ELEMENTS.put("title", Integer.valueOf(TITLE));
 252     ELEMENTS.put("id", Integer.valueOf(ID));
 253   }
 254
 255   /**
 256    * Returns the type of the element if defined, otherwise returns -1. This
 257    * method is useful in startElement and endElement, by not needing to compare
 258    * the element qualified name over and over.
 259    */
 260   private final static int getElementType(String elem) {
 261     Integer val = ELEMENTS.get(elem);
 262     return val == null ? -1 : val.intValue();
 263   }
 264
 265   private File file;
 266   private boolean keepImages = true;
 267   private InputStream is;
 268   private Parser parser = new Parser();
 269
 270   @Override
 271   public void close() throws IOException {
 272     synchronized (EnwikiContentSource.this) {
 273       if (is != null) {
 274         is.close();
 275         is = null;
 276       }
 277     }
 278   }
 279
 280   @Override
 281   public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
 282     String[] tuple = parser.next();
 283     docData.clear();
 284     docData.setName(tuple[ID]);
 285     docData.setBody(tuple[BODY]);
 286     docData.setDate(tuple[DATE]);
 287     docData.setTitle(tuple[TITLE]);
 288     return docData;
 289   }
 290
 291   @Override
 292   public void resetInputs() throws IOException {
 293     super.resetInputs();
 294     is = StreamUtils.inputStream(file);
 295   }
 296
 297   @Override
 298   public void setConfig(Config config) {
 299     super.setConfig(config);
 300     keepImages = config.get("keep.image.only.docs", true);
 301     String fileName = config.get("docs.file", null);
 302     if (fileName == null) {
 303       throw new IllegalArgumentException("docs.file must be set");
 304     }
 305     file = new File(fileName).getAbsoluteFile();
 306   }
 307
 308 }