lucene-java-3.5.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java

   1 package org.apache.lucene.benchmark.utils;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.FileWriter;
  22 import java.io.IOException;
  23 import java.util.Properties;
  24
  25 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
  26 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
  27 import org.apache.lucene.benchmark.byTask.utils.Config;
  28 import org.apache.lucene.document.Document;
  29
  30 /**
  31  * Extract the downloaded Wikipedia dump into separate files for indexing.
  32  */
  33 public class ExtractWikipedia {
  34
  35   private File outputDir;
  36
  37   static public int count = 0;
  38
  39   static final int BASE = 10;
  40   protected DocMaker docMaker;
  41
  42   public ExtractWikipedia(DocMaker docMaker, File outputDir) {
  43     this.outputDir = outputDir;
  44     this.docMaker = docMaker;
  45     System.out.println("Deleting all files in " + outputDir);
  46     File[] files = outputDir.listFiles();
  47     for (int i = 0; i < files.length; i++) {
  48       files[i].delete();
  49     }
  50   }
  51
  52   public File directory(int count, File directory) {
  53     if (directory == null) {
  54       directory = outputDir;
  55     }
  56     int base = BASE;
  57     while (base <= count) {
  58       base *= BASE;
  59     }
  60     if (count < BASE) {
  61       return directory;
  62     }
  63     directory = new File(directory, (Integer.toString(base / BASE)));
  64     directory = new File(directory, (Integer.toString(count / (base / BASE))));
  65     return directory(count % (base / BASE), directory);
  66   }
  67
  68   public void create(String id, String title, String time, String body) {
  69
  70     File d = directory(count++, null);
  71     d.mkdirs();
  72     File f = new File(d, id + ".txt");
  73
  74     StringBuilder contents = new StringBuilder();
  75
  76     contents.append(time);
  77     contents.append("\n\n");
  78     contents.append(title);
  79     contents.append("\n\n");
  80     contents.append(body);
  81     contents.append("\n");
  82
  83     try {
  84       FileWriter writer = new FileWriter(f);
  85       writer.write(contents.toString());
  86       writer.close();
  87     } catch (IOException ioe) {
  88       throw new RuntimeException(ioe);
  89     }
  90
  91   }
  92
  93   public void extract() throws Exception {
  94     Document doc = null;
  95     System.out.println("Starting Extraction");
  96     long start = System.currentTimeMillis();
  97     try {
  98       while ((doc = docMaker.makeDocument()) != null) {
  99         create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc
 100             .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
 101       }
 102     } catch (NoMoreDataException e) {
 103       //continue
 104     }
 105     long finish = System.currentTimeMillis();
 106     System.out.println("Extraction took " + (finish - start) + " ms");
 107   }
 108
 109   public static void main(String[] args) throws Exception {
 110
 111     File wikipedia = null;
 112     File outputDir = new File("./enwiki");
 113     boolean keepImageOnlyDocs = true;
 114     for (int i = 0; i < args.length; i++) {
 115       String arg = args[i];
 116       if (arg.equals("--input") || arg.equals("-i")) {
 117         wikipedia = new File(args[i + 1]);
 118         i++;
 119       } else if (arg.equals("--output") || arg.equals("-o")) {
 120         outputDir = new File(args[i + 1]);
 121         i++;
 122       } else if (arg.equals("--discardImageOnlyDocs") || arg.equals("-d")) {
 123         keepImageOnlyDocs = false;
 124       }
 125
 126     }
 127     DocMaker docMaker = new DocMaker();
 128     Properties properties = new Properties();
 129     properties.setProperty("content.source", "org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource");
 130     properties.setProperty("docs.file", wikipedia.getAbsolutePath());
 131     properties.setProperty("content.source.forever", "false");
 132     properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
 133     docMaker.setConfig(new Config(properties));
 134     docMaker.resetInputs();
 135     if (wikipedia.exists()) {
 136       System.out.println("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
 137       outputDir.mkdirs();
 138       ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
 139       extractor.extract();
 140     } else {
 141       printUsage();
 142     }
 143   }
 144
 145   private static void printUsage() {
 146     System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input|-i <Path to Wikipedia XML file> " +
 147             "[--output|-o <Output Path>] [--discardImageOnlyDocs|-d]");
 148     System.err.println("--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images");
 149   }
 150
 151 }