lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java

   1 package org.apache.lucene.benchmark.byTask.feeds;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.IOException;
  22 import java.io.InputStream;
  23 import java.util.HashMap;
  24 import java.util.Map;
  25
  26 import org.apache.lucene.benchmark.byTask.utils.Config;
  27 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
  28 import org.apache.lucene.util.ThreadInterruptedException;
  29 import org.xml.sax.Attributes;
  30 import org.xml.sax.InputSource;
  31 import org.xml.sax.SAXException;
  32 import org.xml.sax.XMLReader;
  33 import org.xml.sax.helpers.DefaultHandler;
  34 import org.xml.sax.helpers.XMLReaderFactory;
  35
  36 /**
  37  * A {@link ContentSource} which reads the English Wikipedia dump. You can read
  38  * the .bz2 file directly (it will be decompressed on the fly). Config
  39  * properties:
  40  * <ul>
  41  * <li>keep.image.only.docs=false|true (default <b>true</b>).
  42  * <li>docs.file=&lt;path to the file&gt;
  43  * </ul>
  44  */
  45 public class EnwikiContentSource extends ContentSource {
  46
  47   @SuppressWarnings("synthetic-access")
  48   private class Parser extends DefaultHandler implements Runnable {
  49     private Thread t;
  50     private boolean threadDone;
  51     private String[] tuple;
  52     private NoMoreDataException nmde;
  53     private StringBuilder contents = new StringBuilder();
  54     private String title;
  55     private String body;
  56     private String time;
  57     private String id;
  58
  59     public Parser() {
  60     }
  61
  62     String[] next() throws NoMoreDataException {
  63       if (t == null) {
  64         threadDone = false;
  65         t = new Thread(this);
  66         t.setDaemon(true);
  67         t.start();
  68       }
  69       String[] result;
  70       synchronized(this){
  71         while(tuple == null && nmde == null && !threadDone) {
  72           try {
  73             wait();
  74           } catch (InterruptedException ie) {
  75             throw new ThreadInterruptedException(ie);
  76           }
  77         }
  78         if (nmde != null) {
  79           // Set to null so we will re-start thread in case
  80           // we are re-used:
  81           t = null;
  82           throw nmde;
  83         }
  84         if (t != null && threadDone) {
  85           // The thread has exited yet did not hit end of
  86           // data, so this means it hit an exception.  We
  87           // throw NoMorDataException here to force
  88           // benchmark to stop the current alg:
  89           throw new NoMoreDataException();
  90         }
  91         result = tuple;
  92         tuple = null;
  93         notify();
  94       }
  95       return result;
  96     }
  97
  98     String time(String original) {
  99       StringBuilder buffer = new StringBuilder();
 100
 101       buffer.append(original.substring(8, 10));
 102       buffer.append('-');
 103       buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
 104       buffer.append('-');
 105       buffer.append(original.substring(0, 4));
 106       buffer.append(' ');
 107       buffer.append(original.substring(11, 19));
 108       buffer.append(".000");
 109
 110       return buffer.toString();
 111     }
 112
 113     @Override
 114     public void characters(char[] ch, int start, int length) {
 115       contents.append(ch, start, length);
 116     }
 117
 118     @Override
 119     public void endElement(String namespace, String simple, String qualified)
 120       throws SAXException {
 121       int elemType = getElementType(qualified);
 122       switch (elemType) {
 123         case PAGE:
 124           // the body must be null and we either are keeping image docs or the
 125           // title does not start with Image:
 126           if (body != null && (keepImages || !title.startsWith("Image:"))) {
 127             String[] tmpTuple = new String[LENGTH];
 128             tmpTuple[TITLE] = title.replace('\t', ' ');
 129             tmpTuple[DATE] = time.replace('\t', ' ');
 130             tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
 131             tmpTuple[ID] = id;
 132             synchronized(this) {
 133               while (tuple != null) {
 134                 try {
 135                   wait();
 136                 } catch (InterruptedException ie) {
 137                   throw new ThreadInterruptedException(ie);
 138                 }
 139               }
 140               tuple = tmpTuple;
 141               notify();
 142             }
 143           }
 144           break;
 145         case BODY:
 146           body = contents.toString();
 147           //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
 148           String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
 149           if (startsWith.startsWith("#redirect")) {
 150             body = null;
 151           }
 152           break;
 153         case DATE:
 154           time = time(contents.toString());
 155           break;
 156         case TITLE:
 157           title = contents.toString();
 158           break;
 159         case ID:
 160           //the doc id is the first one in the page.  All other ids after that one can be ignored according to the schema
 161           if (id == null) {
 162             id = contents.toString();
 163           }
 164           break;
 165         default:
 166           // this element should be discarded.
 167       }
 168     }
 169
 170     public void run() {
 171
 172       try {
 173         XMLReader reader = XMLReaderFactory.createXMLReader();
 174         reader.setContentHandler(this);
 175         reader.setErrorHandler(this);
 176         while(true){
 177           final InputStream localFileIS = is;
 178           try {
 179             reader.parse(new InputSource(localFileIS));
 180           } catch (IOException ioe) {
 181             synchronized(EnwikiContentSource.this) {
 182               if (localFileIS != is) {
 183                 // fileIS was closed on us, so, just fall
 184                 // through
 185               } else
 186                 // Exception is real
 187                 throw ioe;
 188             }
 189           }
 190           synchronized(this) {
 191             if (!forever) {
 192               nmde = new NoMoreDataException();
 193               notify();
 194               return;
 195             } else if (localFileIS == is) {
 196               // If file is not already re-opened then re-open it now
 197               is = StreamUtils.inputStream(file);
 198             }
 199           }
 200         }
 201       } catch (SAXException sae) {
 202         throw new RuntimeException(sae);
 203       } catch (IOException ioe) {
 204         throw new RuntimeException(ioe);
 205       } finally {
 206         synchronized(this) {
 207           threadDone = true;
 208           notify();
 209         }
 210       }
 211     }
 212
 213     @Override
 214     public void startElement(String namespace, String simple, String qualified,
 215                              Attributes attributes) {
 216       int elemType = getElementType(qualified);
 217       switch (elemType) {
 218         case PAGE:
 219           title = null;
 220           body = null;
 221           time = null;
 222           id = null;
 223           break;
 224         // intentional fall-through.
 225         case BODY:
 226         case DATE:
 227         case TITLE:
 228         case ID:
 229           contents.setLength(0);
 230           break;
 231         default:
 232           // this element should be discarded.
 233       }
 234     }
 235   }
 236
 237   private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>();
 238   private static final int TITLE = 0;
 239   private static final int DATE = TITLE + 1;
 240   private static final int BODY = DATE + 1;
 241   private static final int ID = BODY + 1;
 242   private static final int LENGTH = ID + 1;
 243   // LENGTH is used as the size of the tuple, so whatever constants we need that
 244   // should not be part of the tuple, we should define them after LENGTH.
 245   private static final int PAGE = LENGTH + 1;
 246
 247   private static final String[] months = {"JAN", "FEB", "MAR", "APR",
 248                                   "MAY", "JUN", "JUL", "AUG",
 249                                   "SEP", "OCT", "NOV", "DEC"};
 250
 251   static {
 252     ELEMENTS.put("page", Integer.valueOf(PAGE));
 253     ELEMENTS.put("text", Integer.valueOf(BODY));
 254     ELEMENTS.put("timestamp", Integer.valueOf(DATE));
 255     ELEMENTS.put("title", Integer.valueOf(TITLE));
 256     ELEMENTS.put("id", Integer.valueOf(ID));
 257   }
 258
 259   /**
 260    * Returns the type of the element if defined, otherwise returns -1. This
 261    * method is useful in startElement and endElement, by not needing to compare
 262    * the element qualified name over and over.
 263    */
 264   private final static int getElementType(String elem) {
 265     Integer val = ELEMENTS.get(elem);
 266     return val == null ? -1 : val.intValue();
 267   }
 268
 269   private File file;
 270   private boolean keepImages = true;
 271   private InputStream is;
 272   private Parser parser = new Parser();
 273
 274   @Override
 275   public void close() throws IOException {
 276     synchronized (EnwikiContentSource.this) {
 277       if (is != null) {
 278         is.close();
 279         is = null;
 280       }
 281     }
 282   }
 283
 284   @Override
 285   public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
 286     String[] tuple = parser.next();
 287     docData.clear();
 288     docData.setName(tuple[ID]);
 289     docData.setBody(tuple[BODY]);
 290     docData.setDate(tuple[DATE]);
 291     docData.setTitle(tuple[TITLE]);
 292     return docData;
 293   }
 294
 295   @Override
 296   public void resetInputs() throws IOException {
 297     super.resetInputs();
 298     is = StreamUtils.inputStream(file);
 299   }
 300
 301   @Override
 302   public void setConfig(Config config) {
 303     super.setConfig(config);
 304     keepImages = config.get("keep.image.only.docs", true);
 305     String fileName = config.get("docs.file", null);
 306     if (fileName == null) {
 307       throw new IllegalArgumentException("docs.file must be set");
 308     }
 309     file = new File(fileName).getAbsoluteFile();
 310   }
 311
 312 }