1 package org.apache.lucene.benchmark.utils;
4 * Copyright 2005 The Apache Software Foundation
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 import java.io.BufferedReader;
21 import java.io.FileFilter;
22 import java.io.FileReader;
23 import java.io.FileWriter;
24 import java.io.IOException;
25 import java.util.regex.Matcher;
26 import java.util.regex.Pattern;
/**
 * Splits the Reuters SGML documents into simple text files.
 *
 * <p>Each <code>.sgm</code> file found directly inside the input directory is
 * scanned for documents terminated by a closing <code>&lt;/REUTERS</code>
 * tag. For every document, the contents of its <code>TITLE</code>,
 * <code>DATE</code> and <code>BODY</code> elements are written, in the order
 * they occur in the source and each followed by a blank line, to a file named
 * <code>&lt;sgm file name&gt;-&lt;document number&gt;.txt</code> in the output
 * directory.
 */
public class ExtractReuters {
  private final File reutersDir;
  private final File outputDir;
  private static final String LINE_SEPARATOR = System.getProperty("line.separator");

  /**
   * Creates an extractor and clears the output directory.
   *
   * <p>Every entry directly inside <code>outputDir</code> is passed to
   * {@link File#delete()}; entries that cannot be deleted (for example
   * non-empty sub-directories) are left in place. If <code>outputDir</code>
   * does not exist or is not a directory, nothing is deleted.
   *
   * @param reutersDir directory containing the Reuters <code>.sgm</code> files
   * @param outputDir directory the extracted text files are written to
   */
  public ExtractReuters(File reutersDir, File outputDir) {
    this.reutersDir = reutersDir;
    this.outputDir = outputDir;
    System.out.println("Deleting all files in " + outputDir);
    // listFiles() returns null when the path is not a directory or cannot be
    // read, so guard against it instead of failing with a NullPointerException.
    File[] existing = outputDir.listFiles();
    if (existing != null) {
      for (File f : existing) {
        f.delete();
      }
    }
  }

  /**
   * Extracts every file whose name ends with <code>.sgm</code> from the input
   * directory by calling {@link #extractFile(File)} on it. Prints a message
   * to standard error when no such file is found.
   */
  public void extract() {
    File[] sgmFiles = reutersDir.listFiles(new FileFilter() {
      public boolean accept(File file) {
        return file.getName().endsWith(".sgm");
      }
    });
    if (sgmFiles != null && sgmFiles.length > 0) {
      for (File sgmFile : sgmFiles) {
        extractFile(sgmFile);
      }
    } else {
      System.err.println("No .sgm files in " + reutersDir);
    }
  }

  /** Matches the TITLE, DATE and BODY elements; exactly one group is set per match. */
  Pattern EXTRACTION_PATTERN = Pattern
      .compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");

  /** Plain characters, index-aligned with {@link #META_CHARS_SERIALIZATIONS}. */
  private static final String[] META_CHARS = { "&", "<", ">", "\"", "'" };

  /** SGML entity forms of the characters in {@link #META_CHARS}. */
  private static final String[] META_CHARS_SERIALIZATIONS = { "&amp;", "&lt;",
      "&gt;", "&quot;", "&apos;" };

  /**
   * Extracts all documents of one SGML file into the output directory.
   * Override if you wish to change what is extracted.
   *
   * @param sgmFile the Reuters SGML file to split
   * @throws RuntimeException wrapping any {@link IOException} raised while
   *         reading the input or writing an output file
   */
  protected void extractFile(File sgmFile) {
    try {
      BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
      try {
        StringBuilder buffer = new StringBuilder(1024);
        StringBuilder outBuffer = new StringBuilder(1024);

        String line;
        int docNumber = 0;
        while ((line = reader.readLine()) != null) {
          if (line.indexOf("</REUTERS") == -1) {
            // Not the end of a document yet: accumulate the line (joined by a
            // space) so the regular expression can later see the whole
            // document as a single line.
            buffer.append(line).append(' ');
          } else {
            // Closing tag seen: extract the relevant pieces and flush them to
            // a new file in the output directory.
            Matcher matcher = EXTRACTION_PATTERN.matcher(buffer);
            while (matcher.find()) {
              for (int i = 1; i <= matcher.groupCount(); i++) {
                if (matcher.group(i) != null) {
                  outBuffer.append(matcher.group(i));
                }
              }
              outBuffer.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
            }
            String out = unescape(outBuffer.toString());
            File outFile = new File(outputDir, sgmFile.getName() + "-"
                + (docNumber++) + ".txt");
            writeText(outFile, out);
            // Start the next document with empty buffers.
            outBuffer.setLength(0);
            buffer.setLength(0);
          }
        }
      } finally {
        // Release the file handle even when reading or writing failed.
        reader.close();
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Replaces the SGML escape sequences in <code>text</code> with the characters they stand for. */
  private static String unescape(String text) {
    String result = text;
    for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; i++) {
      // Literal replacement; no regular expression is needed for fixed strings.
      result = result.replace(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
    }
    return result;
  }

  /** Writes <code>text</code> to <code>outFile</code>, always closing the writer. */
  private static void writeText(File outFile, String text) throws IOException {
    FileWriter writer = new FileWriter(outFile);
    try {
      writer.write(text);
    } finally {
      writer.close();
    }
  }

  /**
   * Command-line entry point. Expects two arguments: the path to the Reuters
   * SGM files and the output path. Documents are first extracted into
   * <code>&lt;output path&gt;-tmp</code>, which is then renamed to the
   * requested output path.
   *
   * @param args input directory and output directory
   */
  public static void main(String[] args) {
    if (args.length != 2) {
      usage("Wrong number of arguments ("+args.length+")");
      return;
    }
    File reutersDir = new File(args[0]);
    if (!reutersDir.exists()) {
      usage("Cannot find Path to Reuters SGM files ("+reutersDir+")");
      return;
    }

    // First, extract to a tmp directory and only if everything succeeds, rename
    // to output directory.
    File outputDir = new File(args[1]);
    outputDir = new File(outputDir.getAbsolutePath() + "-tmp");
    outputDir.mkdirs();
    ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
    extractor.extract();
    // Now rename to requested output dir
    File target = new File(args[1]);
    if (!outputDir.renameTo(target)) {
      // renameTo reports failure only through its return value; surface it so
      // the user knows where the extracted files actually are.
      System.err.println("Could not rename " + outputDir + " to " + target);
    }
  }

  /** Prints <code>msg</code> followed by the command-line usage to standard error. */
  private static void usage(String msg) {
    System.err.println("Usage: "+msg+" :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
  }
}