+++ /dev/null
-/*
- Egothor Software License version 1.00
- Copyright (C) 1997-2004 Leo Galambos.
- Copyright (C) 2002-2004 "Egothor developers"
- on behalf of the Egothor Project.
- All rights reserved.
-
- This software is copyrighted by the "Egothor developers". If this
- license applies to a single file or document, the "Egothor developers"
- are the people or entities mentioned as copyright holders in that file
- or document. If this license applies to the Egothor project as a
- whole, the copyright holders are the people or entities mentioned in
- the file CREDITS. This file can be found in the same location as this
- license in the distribution.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, the list of contributors, this list of conditions, and the
- following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, the list of contributors, this list of conditions, and the
- disclaimer that follows these conditions in the documentation
- and/or other materials provided with the distribution.
- 3. The name "Egothor" must not be used to endorse or promote products
- derived from this software without prior written permission. For
- written permission, please contact Leo.G@seznam.cz
- 4. Products derived from this software may not be called "Egothor",
- nor may "Egothor" appear in their name, without prior written
- permission from Leo.G@seznam.cz.
-
- In addition, we request that you include in the end-user documentation
- provided with the redistribution and/or in the software itself an
- acknowledgement equivalent to the following:
- "This product includes software developed by the Egothor Project.
- http://egothor.sf.net/"
-
- THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
- OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
- IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- This software consists of voluntary contributions made by many
- individuals on behalf of the Egothor Project and was originally
- created by Leo Galambos (Leo.G@seznam.cz).
- */
-package org.egothor.stemmer;
-
-import java.io.BufferedOutputStream;
-import java.io.BufferedReader;
-import java.io.DataOutputStream;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.util.StringTokenizer;
-
-/**
- * The Compile class is used to compile a stemmer table.
- */
-public class Compile {
-
- static boolean backward;
- static boolean multi;
- static Trie trie;
-
- /**
- * Entry point to the Compile application.
- * <p>
- * This program takes any number of arguments: the first is the name of the
- * desired stemming algorithm to use (a list is available in the package
- * description) , all of the rest should be the path or paths to a file or
- * files containing a stemmer table to compile.
- *
- * @param args the command line arguments
- */
- public static void main(java.lang.String[] args) {
- if (args.length < 1) {
- return;
- }
-
- args[0].toUpperCase();
-
- backward = args[0].charAt(0) == '-';
- int qq = (backward) ? 1 : 0;
- boolean storeorig = false;
-
- if (args[0].charAt(qq) == '0') {
- storeorig = true;
- qq++;
- }
-
- multi = args[0].charAt(qq) == 'M';
- if (multi) {
- qq++;
- }
-
- String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");
-
- char optimizer[] = new char[args[0].length() - qq];
- for (int i = 0; i < optimizer.length; i++) {
- optimizer[i] = args[0].charAt(qq + i);
- }
-
- for (int i = 1; i < args.length; i++) {
- LineNumberReader in;
- // System.out.println("[" + args[i] + "]");
- Diff diff = new Diff();
- try {
- int stems = 0;
- int words = 0;
-
- allocTrie();
-
- System.out.println(args[i]);
- in = new LineNumberReader(new BufferedReader(new InputStreamReader(
- new FileInputStream(args[i]), charset)));
- for (String line = in.readLine(); line != null; line = in.readLine()) {
- try {
- line = line.toLowerCase();
- StringTokenizer st = new StringTokenizer(line);
- String stem = st.nextToken();
- if (storeorig) {
- trie.add(stem, "-a");
- words++;
- }
- while (st.hasMoreTokens()) {
- String token = st.nextToken();
- if (token.equals(stem) == false) {
- trie.add(token, diff.exec(token, stem));
- words++;
- }
- }
- } catch (java.util.NoSuchElementException x) {
- // no base token (stem) on a line
- }
- }
-
- Optimizer o = new Optimizer();
- Optimizer2 o2 = new Optimizer2();
- Lift l = new Lift(true);
- Lift e = new Lift(false);
- Gener g = new Gener();
-
- for (int j = 0; j < optimizer.length; j++) {
- String prefix;
- switch (optimizer[j]) {
- case 'G':
- trie = trie.reduce(g);
- prefix = "G: ";
- break;
- case 'L':
- trie = trie.reduce(l);
- prefix = "L: ";
- break;
- case 'E':
- trie = trie.reduce(e);
- prefix = "E: ";
- break;
- case '2':
- trie = trie.reduce(o2);
- prefix = "2: ";
- break;
- case '1':
- trie = trie.reduce(o);
- prefix = "1: ";
- break;
- default:
- continue;
- }
- trie.printInfo(prefix + " ");
- }
-
- DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
- new FileOutputStream(args[i] + ".out")));
- os.writeUTF(args[0]);
- trie.store(os);
- os.close();
-
- } catch (FileNotFoundException x) {
- x.printStackTrace();
- } catch (IOException x) {
- x.printStackTrace();
- }
- }
- }
-
- static void allocTrie() {
- if (multi) {
- trie = new MultiTrie2(!backward);
- } else {
- trie = new Trie(!backward);
- }
- }
-}