1 package org.apache.lucene.benchmark.utils;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import java.io.FileWriter;
22 import java.io.IOException;
23 import java.util.Properties;
25 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
26 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
27 import org.apache.lucene.benchmark.byTask.utils.Config;
28 import org.apache.lucene.document.Document;
31 * Extract the downloaded Wikipedia dump into separate files for indexing.
/**
 * Command-line utility (see main) that pulls documents from a DocMaker and writes
 * each one to its own text file beneath an output directory.
 */
33 public class ExtractWikipedia {
// Root directory for extracted files; directory() falls back to it when given a null parent.
35 private File outputDir;
// Running document number, post-incremented by create() for every document written.
// Declared public static, so the value is shared by every instance in the JVM.
37 static public int count = 0;
// Divisor/radix used by directory() when splitting a document number into nested folder names.
39 static final int BASE = 10;
// Source of the documents consumed by extract().
40 protected DocMaker docMaker;
/**
 * Stores the document source and output directory, prints a "Deleting all files"
 * message to stdout, and iterates over the entries of the output directory.
 *
 * @param docMaker  document source kept for later use by extract()
 * @param outputDir directory that is listed here and used as the root for written files
 */
42 public ExtractWikipedia(DocMaker docMaker, File outputDir) {
43 this.outputDir = outputDir;
44 this.docMaker = docMaker;
45 System.out.println("Deleting all files in " + outputDir);
// NOTE(review): File.listFiles() returns null when the path does not exist or is not a
// directory; files.length below would then throw NullPointerException. Confirm that every
// caller creates outputDir before constructing this object.
46 File[] files = outputDir.listFiles();
// NOTE(review): the loop body is not part of this excerpt (embedded numbering jumps from 47
// to 52). Given the message printed above it presumably deletes each entry — confirm.
47 for (int i = 0; i < files.length; i++) {
/**
 * Computes the directory for a document number by appending nested path components
 * derived from the number and then recursing on the remainder.
 *
 * @param count     document number to place
 * @param directory parent to nest under; null means start from outputDir
 * @return the directory produced by the recursive call on the remainder
 */
52 public File directory(int count, File directory) {
53 if (directory == null) {
54 directory = outputDir;
// NOTE(review): the declaration of `base` and the body of the loop below are outside this
// excerpt (embedded numbering jumps 54 -> 57 -> 63). Presumably base starts at BASE and is
// multiplied by BASE until it exceeds count — confirm. The recursion at the end also needs a
// terminating return that is not visible here — confirm the base case.
57 while (base <= count) {
// Two path levels are appended per call: first the value base / BASE, then the quotient
// count / (base / BASE).
63 directory = new File(directory, (Integer.toString(base / BASE)));
64 directory = new File(directory, (Integer.toString(count / (base / BASE))));
// Recurse with what is left of count after dividing by base / BASE.
65 return directory(count % (base / BASE), directory);
/**
 * Writes one document to the file "<id>.txt" inside the directory chosen by directory()
 * for the current value of the shared counter, which is incremented as a side effect.
 * The file content is: time, blank line, title, blank line, body, newline.
 *
 * @param id    used verbatim as the file name stem
 * @param title second section of the file
 * @param time  first section of the file
 * @param body  third section of the file
 * @throws RuntimeException wrapping any IOException raised while writing
 */
68 public void create(String id, String title, String time, String body) {
// Passing null makes directory() start from outputDir; count is post-incremented here.
// NOTE(review): the statement at embedded line 71 is not in this excerpt; nothing visible
// creates the directory before the file is opened — confirm it is created there.
70 File d = directory(count++, null);
// NOTE(review): id is not sanitized, so path separators or other characters that are
// invalid in file names would end up in the path — confirm ids are safe.
72 File f = new File(d, id + ".txt");
74 StringBuilder contents = new StringBuilder();
76 contents.append(time);
77 contents.append("\n\n");
78 contents.append(title);
79 contents.append("\n\n");
80 contents.append(body);
81 contents.append("\n");
// NOTE(review): FileWriter(File) encodes with the JVM default charset, so output bytes can
// differ between machines. Also, no close() is visible (embedded line 86 is missing); if it
// sits inside the try without a finally, the writer leaks when write() throws — confirm.
84 FileWriter writer = new FileWriter(f);
85 writer.write(contents.toString());
87 } catch (IOException ioe) {
// Rethrown unchecked with the original exception preserved as the cause.
88 throw new RuntimeException(ioe);
/**
 * Pulls documents from docMaker until makeDocument() returns null, handing the ID, TITLE,
 * DATE and BODY fields of each one to create(), and reports the elapsed wall-clock time
 * in milliseconds on stdout.
 *
 * @throws Exception declared by the signature; the visible code does not narrow it
 */
93 public void extract() throws Exception {
95 System.out.println("Starting Extraction");
96 long start = System.currentTimeMillis();
// NOTE(review): the declaration of `doc` and the opening of the try block are not part of
// this excerpt (embedded lines 94 and 97 are missing).
98 while ((doc = docMaker.makeDocument()) != null) {
99 create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc
100 .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
// NOTE(review): the catch body is not visible; presumably NoMoreDataException is treated as
// the normal end of input and execution falls through to the timing report — confirm.
102 } catch (NoMoreDataException e) {
105 long finish = System.currentTimeMillis();
106 System.out.println("Extraction took " + (finish - start) + " ms");
/**
 * Command-line entry point. Recognized arguments:
 *   --input | -i <file>           Wikipedia dump to read
 *   --output | -o <dir>           output directory (defaults to ./enwiki)
 *   --discardImageOnlyDocs | -d   sets keep.image.only.docs to false
 * Builds a DocMaker configured with EnwikiContentSource and, when the input file exists,
 * constructs an ExtractWikipedia over it.
 *
 * @param args command-line arguments as described above
 * @throws Exception declared by the signature; the visible code does not narrow it
 */
109 public static void main(String[] args) throws Exception {
111 File wikipedia = null;
112 File outputDir = new File("./enwiki");
113 boolean keepImageOnlyDocs = true;
114 for (int i = 0; i < args.length; i++) {
115 String arg = args[i];
116 if (arg.equals("--input") || arg.equals("-i")) {
// NOTE(review): args[i + 1] is read without a bounds check; a trailing -i or -o would throw
// ArrayIndexOutOfBoundsException. The statements at embedded lines 118 and 121 are not
// visible — presumably they advance i past the consumed value; confirm.
117 wikipedia = new File(args[i + 1]);
119 } else if (arg.equals("--output") || arg.equals("-o")) {
120 outputDir = new File(args[i + 1]);
122 } else if (arg.equals("--discardImageOnlyDocs") || arg.equals("-d")) {
123 keepImageOnlyDocs = false;
127 DocMaker docMaker = new DocMaker();
128 Properties properties = new Properties();
129 properties.setProperty("content.source", "org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource");
// NOTE(review): wikipedia is still null here when no --input was supplied, and this
// dereference runs before the exists() check further down, so the program would fail with
// NullPointerException instead of printing usage — unless a line missing from this excerpt
// (embedded 124-126) guards against it; confirm.
130 properties.setProperty("docs.file", wikipedia.getAbsolutePath());
// "false" here is the setting under which extract() expects the input to end eventually.
131 properties.setProperty("content.source.forever", "false");
132 properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
133 docMaker.setConfig(new Config(properties));
// NOTE(review): resetInputs() is invoked before the input file is known to exist — confirm
// that it tolerates a missing file.
134 docMaker.resetInputs();
135 if (wikipedia.exists()) {
136 System.out.println("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
// NOTE(review): the statement at embedded line 137 and everything after 138 are not part of
// this excerpt; the call that starts extraction and the handling of a missing input file
// (presumably printUsage()) cannot be seen here — confirm.
138 ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
/**
 * Prints the command-line synopsis and a one-line explanation of the
 * --discardImageOnlyDocs flag to standard error.
 */
145 private static void printUsage() {
// Synopsis line: required --input, optional --output and --discardImageOnlyDocs.
146 System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input|-i <Path to Wikipedia XML file> " +
147 "[--output|-o <Output Path>] [--discardImageOnlyDocs|-d]");
148 System.err.println("--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images");