lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java

   1 package org.apache.lucene.benchmark.utils;
   2
   3 /**
   4  * Copyright 2005 The Apache Software Foundation
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  */
  18
  19 import java.io.BufferedReader;
  20 import java.io.File;
  21 import java.io.FileFilter;
  22 import java.io.FileReader;
  23 import java.io.FileWriter;
  24 import java.io.IOException;
  25 import java.util.regex.Matcher;
  26 import java.util.regex.Pattern;
  27
  28 /**
  29  * Split the Reuters SGML documents into Simple Text files containing: Title,
  30  * Date, Dateline, Body
  31  */
  32 public class ExtractReuters {
  33   private File reutersDir;
  34   private File outputDir;
  35   private static final String LINE_SEPARATOR = System.getProperty("line.separator");
  36
  37   public ExtractReuters(File reutersDir, File outputDir) {
  38     this.reutersDir = reutersDir;
  39     this.outputDir = outputDir;
  40     System.out.println("Deleting all files in " + outputDir);
  41     for (File f : outputDir.listFiles()) {
  42       f.delete();
  43     }
  44   }
  45
  46   public void extract() {
  47     File[] sgmFiles = reutersDir.listFiles(new FileFilter() {
  48       public boolean accept(File file) {
  49         return file.getName().endsWith(".sgm");
  50       }
  51     });
  52     if (sgmFiles != null && sgmFiles.length > 0) {
  53       for (File sgmFile : sgmFiles) {
  54         extractFile(sgmFile);
  55       }
  56     } else {
  57       System.err.println("No .sgm files in " + reutersDir);
  58     }
  59   }
  60
  61   Pattern EXTRACTION_PATTERN = Pattern
  62       .compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");
  63
  64   private static String[] META_CHARS = { "&", "<", ">", "\"", "'" };
  65
  66   private static String[] META_CHARS_SERIALIZATIONS = { "&amp;", "&lt;",
  67       "&gt;", "&quot;", "&apos;" };
  68
  69   /**
  70    * Override if you wish to change what is extracted
  71    *
  72    * @param sgmFile
  73    */
  74   protected void extractFile(File sgmFile) {
  75     try {
  76       BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
  77
  78       StringBuilder buffer = new StringBuilder(1024);
  79       StringBuilder outBuffer = new StringBuilder(1024);
  80
  81       String line = null;
  82       int docNumber = 0;
  83       while ((line = reader.readLine()) != null) {
  84         // when we see a closing reuters tag, flush the file
  85
  86         if (line.indexOf("</REUTERS") == -1) {
  87           // Replace the SGM escape sequences
  88
  89           buffer.append(line).append(' ');// accumulate the strings for now,
  90                                           // then apply regular expression to
  91                                           // get the pieces,
  92         } else {
  93           // Extract the relevant pieces and write to a file in the output dir
  94           Matcher matcher = EXTRACTION_PATTERN.matcher(buffer);
  95           while (matcher.find()) {
  96             for (int i = 1; i <= matcher.groupCount(); i++) {
  97               if (matcher.group(i) != null) {
  98                 outBuffer.append(matcher.group(i));
  99               }
 100             }
 101             outBuffer.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
 102           }
 103           String out = outBuffer.toString();
 104           for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; i++) {
 105             out = out.replaceAll(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
 106           }
 107           File outFile = new File(outputDir, sgmFile.getName() + "-"
 108               + (docNumber++) + ".txt");
 109           // System.out.println("Writing " + outFile);
 110           FileWriter writer = new FileWriter(outFile);
 111           writer.write(out);
 112           writer.close();
 113           outBuffer.setLength(0);
 114           buffer.setLength(0);
 115         }
 116       }
 117       reader.close();
 118     } catch (IOException e) {
 119       throw new RuntimeException(e);
 120     }
 121   }
 122
 123   public static void main(String[] args) {
 124     if (args.length != 2) {
 125       usage("Wrong number of arguments ("+args.length+")");
 126       return;
 127     }
 128     File reutersDir = new File(args[0]);
 129     if (!reutersDir.exists()) {
 130       usage("Cannot find Path to Reuters SGM files ("+reutersDir+")");
 131       return;
 132     }
 133
 134     // First, extract to a tmp directory and only if everything succeeds, rename
 135     // to output directory.
 136     File outputDir = new File(args[1]);
 137     outputDir = new File(outputDir.getAbsolutePath() + "-tmp");
 138     outputDir.mkdirs();
 139     ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
 140     extractor.extract();
 141     // Now rename to requested output dir
 142     outputDir.renameTo(new File(args[1]));
 143   }
 144
 145   private static void usage(String msg) {
 146     System.err.println("Usage: "+msg+" :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
 147   }
 148
 149 }