X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Compile.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Compile.java b/lucene-java-3.5.0/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Compile.java new file mode 100644 index 0000000..7b522e5 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Compile.java @@ -0,0 +1,205 @@ +/* + Egothor Software License version 1.00 + Copyright (C) 1997-2004 Leo Galambos. + Copyright (C) 2002-2004 "Egothor developers" + on behalf of the Egothor Project. + All rights reserved. + + This software is copyrighted by the "Egothor developers". If this + license applies to a single file or document, the "Egothor developers" + are the people or entities mentioned as copyright holders in that file + or document. If this license applies to the Egothor project as a + whole, the copyright holders are the people or entities mentioned in + the file CREDITS. This file can be found in the same location as this + license in the distribution. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, the list of contributors, this list of conditions, and the + following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, the list of contributors, this list of conditions, and the + disclaimer that follows these conditions in the documentation + and/or other materials provided with the distribution. + 3. The name "Egothor" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact Leo.G@seznam.cz + 4. Products derived from this software may not be called "Egothor", + nor may "Egothor" appear in their name, without prior written + permission from Leo.G@seznam.cz. + + In addition, we request that you include in the end-user documentation + provided with the redistribution and/or in the software itself an + acknowledgement equivalent to the following: + "This product includes software developed by the Egothor Project. + http://egothor.sf.net/" + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the Egothor Project and was originally + created by Leo Galambos (Leo.G@seznam.cz). + */ +package org.egothor.stemmer; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.util.StringTokenizer; + +/** + * The Compile class is used to compile a stemmer table. + */ +public class Compile { + + static boolean backward; + static boolean multi; + static Trie trie; + + /** + * Entry point to the Compile application. + *

+ * This program takes any number of arguments: the first is the name of the + * desired stemming algorithm to use (a list is available in the package + * description) , all of the rest should be the path or paths to a file or + * files containing a stemmer table to compile. + * + * @param args the command line arguments + */ + public static void main(java.lang.String[] args) { + if (args.length < 1) { + return; + } + + args[0].toUpperCase(); + + backward = args[0].charAt(0) == '-'; + int qq = (backward) ? 1 : 0; + boolean storeorig = false; + + if (args[0].charAt(qq) == '0') { + storeorig = true; + qq++; + } + + multi = args[0].charAt(qq) == 'M'; + if (multi) { + qq++; + } + + String charset = System.getProperty("egothor.stemmer.charset", "UTF-8"); + + char optimizer[] = new char[args[0].length() - qq]; + for (int i = 0; i < optimizer.length; i++) { + optimizer[i] = args[0].charAt(qq + i); + } + + for (int i = 1; i < args.length; i++) { + LineNumberReader in; + // System.out.println("[" + args[i] + "]"); + Diff diff = new Diff(); + try { + int stems = 0; + int words = 0; + + allocTrie(); + + System.out.println(args[i]); + in = new LineNumberReader(new BufferedReader(new InputStreamReader( + new FileInputStream(args[i]), charset))); + for (String line = in.readLine(); line != null; line = in.readLine()) { + try { + line = line.toLowerCase(); + StringTokenizer st = new StringTokenizer(line); + String stem = st.nextToken(); + if (storeorig) { + trie.add(stem, "-a"); + words++; + } + while (st.hasMoreTokens()) { + String token = st.nextToken(); + if (token.equals(stem) == false) { + trie.add(token, diff.exec(token, stem)); + words++; + } + } + } catch (java.util.NoSuchElementException x) { + // no base token (stem) on a line + } + } + + Optimizer o = new Optimizer(); + Optimizer2 o2 = new Optimizer2(); + Lift l = new Lift(true); + Lift e = new Lift(false); + Gener g = new Gener(); + + for (int j = 0; j < optimizer.length; j++) { + String prefix; + switch (optimizer[j]) { + case 'G': + trie = trie.reduce(g); + prefix = "G: "; + break; + case 'L': + trie = trie.reduce(l); + prefix = "L: "; + break; + case 'E': + trie = trie.reduce(e); + prefix = "E: "; + break; + case '2': + trie = trie.reduce(o2); + prefix = "2: "; + break; + case '1': + trie = trie.reduce(o); + prefix = "1: "; + break; + default: + continue; + } + trie.printInfo(prefix + " "); + } + + DataOutputStream os = new DataOutputStream(new BufferedOutputStream( + new FileOutputStream(args[i] + ".out"))); + os.writeUTF(args[0]); + trie.store(os); + os.close(); + + } catch (FileNotFoundException x) { + x.printStackTrace(); + } catch (IOException x) { + x.printStackTrace(); + } + } + } + + static void allocTrie() { + if (multi) { + trie = new MultiTrie2(!backward); + } else { + trie = new Trie(!backward); + } + } +}