pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.4.0 / lucene / contrib / analyzers / common / src / java / org / apache / lucene / analysis / fr / FrenchStemmer.java
diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
deleted file mode 100644 (file)
index a42382f..0000000
+++ /dev/null
@@ -1,712 +0,0 @@
-package org.apache.lucene.analysis.fr;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
/**
 * A stemmer for French words.
 * <p>
 * The algorithm is based on the work of Dr Martin Porter on his Snowball project;
 * refer to http://snowball.sourceforge.net/french/stemmer.html
 * (French stemming algorithm) for details.
 * </p>
 * <p>
 * While a term is being stemmed, some vowels are temporarily written in upper case
 * ({@code I}, {@code U}, {@code Y}) so that they are treated as consonants; they are
 * turned back into lower case before the stem is returned.
 * </p>
 * <p>
 * An instance keeps its working buffers and regions in fields that are rewritten on every
 * call to {@link #stem(String)}, so a single instance must not be used by several threads
 * at the same time.
 * </p>
 * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 5.0
 */
@Deprecated
public class FrenchStemmer {

    // Suffix tables. Within each table the order matters: the first entry that matches wins,
    // so longer suffixes are listed before the shorter suffixes they end with.
    // Accented letters are written as Unicode escapes so that the source compiles the same
    // way whatever encoding the compiler assumes.

    /** Step 1: suffixes deleted when found in R2. */
    private static final String[] STEP1_R2_SUFFIXES =
        { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
    private static final String[] LOGIE_SUFFIXES = { "logies", "logie" };
    private static final String[] USION_SUFFIXES = { "usions", "utions", "usion", "ution" };
    private static final String[] ENCE_SUFFIXES = { "ences", "ence" };
    private static final String[] ATION_SUFFIXES =
        { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
    private static final String[] EMENT_SUFFIXES = { "ements", "ement" };
    private static final String[] ISSEMENT_SUFFIXES = { "issements", "issement" };
    /** "ites" / "ite" with an acute accent on the e. */
    private static final String[] ITE_SUFFIXES = { "it\u00e9s", "it\u00e9" };
    private static final String[] IF_SUFFIXES = { "ifs", "ives", "if", "ive" };
    private static final String[] EAUX_SUFFIXES = { "eaux" };
    private static final String[] AUX_SUFFIXES = { "aux" };
    private static final String[] EUSE_SUFFIXES = { "euses", "euse" };
    private static final String[] EUX_SUFFIXES = { "eux" };
    private static final String[] AMMENT_SUFFIXES = { "amment" };
    private static final String[] EMMENT_SUFFIXES = { "emment" };
    private static final String[] MENT_SUFFIXES = { "ments", "ment" };

    /** Step 2a: verb suffixes beginning with i (or i-circumflex). */
    private static final String[] STEP2A_SUFFIXES =
        { "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira",
          "irent", "iriez", "irez", "irions", "irons", "iront",
          "issaIent", "issais", "issantes", "issante", "issants", "issant",
          "issait", "issions", "issons", "issiez", "issez", "issent",
          "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i" };

    /** Step 2b: verb suffixes that are simply deleted from RV. */
    private static final String[] STEP2B_DELETE_SUFFIXES =
        { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
          "erons", "eront", "erez", "\u00e8rent", "era", "\u00e9es", "iez",
          "\u00e9e", "\u00e9s", "er", "ez", "\u00e9" };

    /**
     * Step 2b: verb suffixes beginning with a, deleted from RV together with a preceding e.
     * The entries starting with an upper-case A are kept from the original table; nothing in
     * this class ever writes an upper-case A into the buffer.
     */
    private static final String[] STEP2B_A_SUFFIXES =
        { "assions", "assiez", "assent", "asses", "asse", "aIent",
          "antes", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant",
          "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a" };

    private static final String[] IONS_SUFFIXES = { "ions" };
    private static final String[] ION_SUFFIXES = { "ion" };
    /** "Iere", "iere" (grave accent on the first e), "Ier", "ier". */
    private static final String[] IER_SUFFIXES = { "I\u00e8re", "i\u00e8re", "Ier", "ier" };
    private static final String[] E_SUFFIXES = { "e" };
    /** e with diaeresis. */
    private static final String[] E_DIAERESIS_SUFFIXES = { "\u00eb" };

    /**
     * Buffer for the terms while stemming them.
     */
    private final StringBuilder sb = new StringBuilder();

    /**
     * A temporary buffer, used to reconstruct R2
     */
    private final StringBuilder tb = new StringBuilder();

    /**
     * Region R0 is equal to the whole buffer
     */
    private String R0;

    /**
     * Region RV
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."
     * An empty region is represented by <tt>null</tt>.
     */
    private String RV;

    /**
     * Region R1
     * "R1 is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R1;

    /**
     * Region R2
     * "R2 is the region after the first non-vowel in R1 following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R2;

    /**
     * Set to true if we need to perform step 2
     */
    private boolean suite;

    /**
     * Set to true if the buffer was modified
     */
    private boolean modified;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     * <p>
     * Terms that are not stemmable (see {@link #isStemmable(String)}) are returned unchanged.
     * Every other term is lower-cased and run through the steps of the algorithm.
     * </p>
     *
     * @param term the term that should be stemmed; must not be <tt>null</tt>
     * @return discriminator for <tt>term</tt>
     */
    protected String stem( String term ) {
        if ( !isStemmable( term ) ) {
            return term;
        }

        // Lower-case with a fixed locale: the default-locale variant maps 'I' to a dotless i
        // when the JVM runs under a Turkish or Azeri locale. Locale.ENGLISH is used instead
        // of Locale.ROOT because the latter does not exist before Java 6.
        sb.setLength( 0 );
        sb.append( term.toLowerCase( java.util.Locale.ENGLISH ) );

        modified = false;
        suite = false;

        treatVowels( sb );
        setStrings();

        step1();

        // Step 2 runs when step 1 changed nothing, or when it handled one of the
        // amment / emment / ment(s) endings.
        if ( ( !modified || suite ) && RV != null ) {
            suite = step2a();
            if ( !suite ) {
                step2b();
            }
        }

        if ( modified || suite ) {
            step3();
        } else {
            step4();
        }

        step5();
        step6();

        // The upper-case I, U and Y written by treatVowels() (and by the "iqU" replacements)
        // are internal markers only and must not leak into the returned stem.
        restoreMarkedVowels();

        return sb.toString();
    }

    /**
     * Sets the search region Strings<br>
     * it needs to be done each time the buffer was modified
     */
    private void setStrings() {
        R0 = sb.toString();
        RV = retrieveRV( sb );
        R1 = retrieveR( sb );
        if ( R1 == null ) {
            R2 = null;
        } else {
            // R2 is obtained by applying the R1 rule to R1 itself.
            tb.setLength( 0 );
            tb.append( R1 );
            R2 = retrieveR( tb );
        }
    }

    /**
     * Removes the last <tt>count</tt> characters of the buffer and refreshes the regions.
     *
     * @param count number of trailing characters to remove
     */
    private void removeTail( int count ) {
        sb.setLength( sb.length() - count );
        setStrings();
    }

    /**
     * Replaces the last <tt>count</tt> characters of the buffer and refreshes the regions.
     *
     * @param count number of trailing characters to replace
     * @param replacement the text written in their place
     */
    private void replaceTail( int count, String replacement ) {
        sb.replace( sb.length() - count, sb.length(), replacement );
        setStrings();
    }

    /**
     * First step of the Porter Algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     * <p>
     * The region fields are re-read for every call below because each helper refreshes
     * them as soon as it changes the buffer.
     * </p>
     */
    private void step1() {
        deleteFrom( R2, STEP1_R2_SUFFIXES );

        replaceFrom( R2, LOGIE_SUFFIXES, "log" );
        replaceFrom( R2, USION_SUFFIXES, "u" );
        replaceFrom( R2, ENCE_SUFFIXES, "ent" );

        deleteButSuffixFromElseReplace( R2, ATION_SUFFIXES, "ic", true, R0, "iqU" );

        deleteButSuffixFromElseReplace( R2, EMENT_SUFFIXES, "eus", false, R0, "eux" );
        deleteButSuffixFrom( R2, EMENT_SUFFIXES, "ativ", false );
        deleteButSuffixFrom( R2, EMENT_SUFFIXES, "iv", false );
        deleteButSuffixFrom( R2, EMENT_SUFFIXES, "abl", false );
        deleteButSuffixFrom( R2, EMENT_SUFFIXES, "iqU", false );

        deleteFromIfTestVowelBeforeIn( R1, ISSEMENT_SUFFIXES, false, R0 );
        deleteFrom( RV, EMENT_SUFFIXES );

        deleteButSuffixFromElseReplace( R2, ITE_SUFFIXES, "abil", false, R0, "abl" );
        deleteButSuffixFromElseReplace( R2, ITE_SUFFIXES, "ic", false, R0, "iqU" );
        deleteButSuffixFrom( R2, ITE_SUFFIXES, "iv", true );

        deleteButSuffixFromElseReplace( R2, IF_SUFFIXES, "icat", false, R0, "iqU" );
        deleteButSuffixFromElseReplace( R2, IF_SUFFIXES, "at", true, R2, "iqU" );

        replaceFrom( R0, EAUX_SUFFIXES, "eau" );

        replaceFrom( R1, AUX_SUFFIXES, "al" );

        deleteButSuffixFromElseReplace( R2, EUSE_SUFFIXES, "", true, R1, "eux" );

        deleteFrom( R2, EUX_SUFFIXES );

        // if one of the next steps is performed, we will need to perform step2a
        if ( replaceFrom( RV, AMMENT_SUFFIXES, "ant" ) ) {
            suite = true;
        }
        if ( replaceFrom( RV, EMMENT_SUFFIXES, "ent" ) ) {
            suite = true;
        }
        if ( deleteFromIfTestVowelBeforeIn( RV, MENT_SUFFIXES, true, RV ) ) {
            suite = true;
        }
    }

    /**
     * Second step (A) of the Porter Algorithm<br>
     * Will be performed if nothing changed from the first step
     * or changes were done in the amment, emment, ments or ment suffixes<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * @return boolean - true if something changed in the StringBuilder
     */
    private boolean step2a() {
        return deleteFromIfTestVowelBeforeIn( RV, STEP2A_SUFFIXES, false, RV );
    }

    /**
     * Second step (B) of the Porter Algorithm<br>
     * Will be performed if step 2 A was performed unsuccessfully<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step2b() {
        deleteFrom( RV, STEP2B_DELETE_SUFFIXES );
        deleteButSuffixFrom( RV, STEP2B_A_SUFFIXES, "e", true );
        deleteFrom( R2, IONS_SUFFIXES );
    }

    /**
     * Third step of the Porter Algorithm<br>
     * Replaces a final Y by i, or a final c-cedilla by c.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step3() {
        int last = sb.length() - 1;
        if ( last < 0 ) {
            return;
        }
        char ch = sb.charAt( last );
        if ( ch == 'Y' ) {
            sb.setCharAt( last, 'i' );
            setStrings();
        } else if ( ch == '\u00e7' ) { // c-cedilla
            sb.setCharAt( last, 'c' );
            setStrings();
        }
    }

    /**
     * Fourth step of the Porter Algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step4() {
        int len = sb.length();
        if ( len > 1 && sb.charAt( len - 1 ) == 's' ) {
            // A final s is dropped unless it follows a, i, o, u, e-grave or another s.
            char before = sb.charAt( len - 2 );
            if ( before != 'a' && before != 'i' && before != 'o' && before != 'u'
                    && before != '\u00e8' && before != 's' ) {
                removeTail( 1 );
            }
        }

        if ( !deleteFromIfPrecededIn( R2, ION_SUFFIXES, RV, "s" ) ) {
            deleteFromIfPrecededIn( R2, ION_SUFFIXES, RV, "t" );
        }

        replaceFrom( RV, IER_SUFFIXES, "i" );
        deleteFrom( RV, E_SUFFIXES );
        deleteFromIfPrecededIn( RV, E_DIAERESIS_SUFFIXES, R0, "gu" );
    }

    /**
     * Fifth step of the Porter Algorithm<br>
     * Removes the last letter of a word ending in enn, onn, ett, ell or eill.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step5() {
        if ( R0 == null ) {
            return;
        }
        if ( R0.endsWith( "enn" ) || R0.endsWith( "onn" ) || R0.endsWith( "ett" )
                || R0.endsWith( "ell" ) || R0.endsWith( "eill" ) ) {
            removeTail( 1 );
        }
    }

    /**
     * Sixth (and last!) step of the Porter Algorithm<br>
     * If the word ends with an e-acute or e-grave followed by at least one non-vowel,
     * the accent is removed from that e.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step6() {
        int len = sb.length();
        // Walk back over the trailing non-vowels to the last vowel of the word.
        int i = len - 1;
        while ( i >= 0 && !isVowel( sb.charAt( i ) ) ) {
            i--;
        }
        boolean followedByNonVowel = i < len - 1;
        if ( i >= 0 && followedByNonVowel ) {
            char ch = sb.charAt( i );
            if ( ch == '\u00e9' || ch == '\u00e8' ) {
                sb.setCharAt( i, 'e' );
            }
        }
    }

    /**
     * Turns the marker letters I, U and Y left in the buffer back into lower case.
     */
    private void restoreMarkedVowels() {
        for ( int i = 0; i < sb.length(); i++ ) {
            switch ( sb.charAt( i ) ) {
                case 'I':
                    sb.setCharAt( i, 'i' );
                    break;
                case 'U':
                    sb.setCharAt( i, 'u' );
                    break;
                case 'Y':
                    sb.setCharAt( i, 'y' );
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param from java.lang.String - the secondary source zone for search
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @return boolean - true if modified
     */
    private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
        if ( source == null ) {
            return false;
        }
        for ( String suffix : search ) {
            if ( source.endsWith( suffix ) && from != null && from.endsWith( prefix + suffix ) ) {
                removeTail( suffix.length() );
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param vowel boolean - true if we need a vowel before the search string
     * @param from java.lang.String - the secondary source zone for search (where vowel could be)
     * @return boolean - true if modified
     */
    private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
        if ( source == null || from == null ) {
            return false;
        }
        for ( String suffix : search ) {
            if ( !source.endsWith( suffix ) ) {
                continue;
            }
            // The letter in front of the suffix must itself lie inside "from".
            int span = suffix.length() + 1;
            if ( span > from.length() ) {
                continue;
            }
            if ( isVowel( sb.charAt( sb.length() - span ) ) == vowel ) {
                modified = true;
                removeTail( suffix.length() );
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by the prefix
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     */
    private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
        if ( source == null ) {
            return;
        }
        for ( String suffix : search ) {
            if ( source.endsWith( prefix + suffix ) ) {
                modified = true;
                removeTail( prefix.length() + suffix.length() );
                return;
            }
            if ( without && source.endsWith( suffix ) ) {
                modified = true;
                removeTail( suffix.length() );
                return;
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by prefix<br>
     * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
     * or delete the suffix if specified
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     * @param from java.lang.String - the secondary source zone, tested for prefix + search string
     * @param replace java.lang.String - the text that replaces prefix + search string when
     *        they are found in "from" but not in "source"
     */
    private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
        if ( source == null ) {
            return;
        }
        for ( String suffix : search ) {
            String prefixed = prefix + suffix;
            if ( source.endsWith( prefixed ) ) {
                modified = true;
                removeTail( prefixed.length() );
                return;
            }
            if ( from != null && from.endsWith( prefixed ) ) {
                modified = true;
                replaceTail( prefixed.length(), replace );
                return;
            }
            if ( without && source.endsWith( suffix ) ) {
                modified = true;
                removeTail( suffix.length() );
                return;
            }
        }
    }

    /**
     * Replace a search string with another within the source zone
     *
     * @param source java.lang.String - the source zone for search
     * @param search java.lang.String[] - the strings to search for replacement
     * @param replace java.lang.String - the replacement string
     * @return boolean - true if a replacement was made
     */
    private boolean replaceFrom( String source, String[] search, String replace ) {
        if ( source == null ) {
            return false;
        }
        for ( String suffix : search ) {
            if ( source.endsWith( suffix ) ) {
                modified = true;
                replaceTail( suffix.length(), replace );
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a search string within the source zone
     *
     * @param source the source zone for search
     * @param suffix the strings to search for suppression
     */
    private void deleteFrom( String source, String[] suffix ) {
        if ( source == null ) {
            return;
        }
        for ( String candidate : suffix ) {
            if ( source.endsWith( candidate ) ) {
                modified = true;
                removeTail( candidate.length() );
                return;
            }
        }
    }

    /**
     * Test if a char is a french vowel, including accentuated ones.
     * Upper-case letters, and therefore the I, U and Y markers, are not vowels.
     *
     * @param ch the char to test
     * @return boolean - true if the char is a vowel
     */
    private boolean isVowel( char ch ) {
        switch ( ch ) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case '\u00e2': // a circumflex
            case '\u00e0': // a grave
            case '\u00eb': // e diaeresis
            case '\u00e9': // e acute
            case '\u00ea': // e circumflex
            case '\u00e8': // e grave
            case '\u00ef': // i diaeresis
            case '\u00ee': // i circumflex
            case '\u00f4': // o circumflex
            case '\u00fc': // u diaeresis
            case '\u00f9': // u grave
            case '\u00fb': // u circumflex
                return true;
            default:
                return false;
        }
    }

    /**
     * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
     * "R is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"<br>
     * @param buffer java.lang.StringBuilder - the in buffer
     * @return java.lang.String - the resulting string, or <tt>null</tt> if the region is empty
     */
    private String retrieveR( StringBuilder buffer ) {
        int len = buffer.length();

        int c = 0;
        while ( c < len && !isVowel( buffer.charAt( c ) ) ) {
            c++;
        }
        if ( c == len ) {
            return null; // no vowel at all
        }

        while ( c < len && isVowel( buffer.charAt( c ) ) ) {
            c++;
        }
        // c is now the first non-vowel after the vowel; the region starts right after it.
        if ( c + 1 < len ) {
            return buffer.substring( c + 1, len );
        }
        return null;
    }

    /**
     * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."<br>
     * Buffers of three characters or less have no RV zone.
     * @param buffer java.lang.StringBuilder - the in buffer
     * @return java.lang.String - the resulting string, or <tt>null</tt> if the region is empty
     */
    private String retrieveRV( StringBuilder buffer ) {
        int len = buffer.length();
        if ( len <= 3 ) {
            return null;
        }
        if ( isVowel( buffer.charAt( 0 ) ) && isVowel( buffer.charAt( 1 ) ) ) {
            return buffer.substring( 3, len );
        }
        for ( int c = 1; c < len; c++ ) {
            if ( isVowel( buffer.charAt( c ) ) ) {
                return ( c + 1 < len ) ? buffer.substring( c + 1, len ) : null;
            }
        }
        // No vowel after the first letter: the region is the (empty) end of the word.
        return null;
    }

    /**
     * Turns u and i preceded AND followed by a vowel to UpperCase<br>
     * Turns y preceded OR followed by a vowel to UpperCase<br>
     * Turns u preceded by q to UpperCase<br>
     * The buffer is changed in place, from left to right, so the test on a preceding
     * letter sees the letters already turned to upper case.
     *
     * @param buffer java.lang.StringBuilder - the buffer to treat
     */
    private void treatVowels( StringBuilder buffer ) {
        int len = buffer.length();
        int last = len - 1;
        for ( int c = 0; c < len; c++ ) {
            char ch = buffer.charAt( c );

            if ( c == 0 ) {
                // first char: only a y followed by a vowel is marked
                if ( len > 1 && ch == 'y' && isVowel( buffer.charAt( 1 ) ) ) {
                    buffer.setCharAt( 0, 'Y' );
                }
            } else if ( c == last ) {
                // last char: only the preceding letter can be tested
                if ( ch == 'u' && buffer.charAt( c - 1 ) == 'q' ) {
                    buffer.setCharAt( c, 'U' );
                } else if ( ch == 'y' && isVowel( buffer.charAt( c - 1 ) ) ) {
                    buffer.setCharAt( c, 'Y' );
                }
            } else {
                char prev = buffer.charAt( c - 1 );
                char next = buffer.charAt( c + 1 );
                if ( ch == 'u' ) {
                    if ( prev == 'q' || ( isVowel( prev ) && isVowel( next ) ) ) {
                        buffer.setCharAt( c, 'U' );
                    }
                } else if ( ch == 'i' ) {
                    if ( isVowel( prev ) && isVowel( next ) ) {
                        buffer.setCharAt( c, 'I' );
                    }
                } else if ( ch == 'y' ) {
                    if ( isVowel( prev ) || isVowel( next ) ) {
                        buffer.setCharAt( c, 'Y' );
                    }
                }
            }
        }
    }

    /**
     * Checks a term if it can be processed correctly.
     * <p>
     * A term is stemmable when every character is a letter and no character other than
     * the first one is in upper case. The empty term is stemmable.
     * </p>
     *
     * @param term the term to check; must not be <tt>null</tt>
     * @return boolean - true if, and only if, the given term consists in letters
     *         with at most a leading upper-case one.
     */
    private boolean isStemmable( String term ) {
        for ( int c = 0; c < term.length(); c++ ) {
            char ch = term.charAt( c );
            // Discard terms that contain non-letter characters.
            if ( !Character.isLetter( ch ) ) {
                return false;
            }
            // Discard terms with an uppercase letter anywhere but at the start; this also
            // covers terms that contain several uppercase letters.
            if ( c > 0 && Character.isUpperCase( ch ) ) {
                return false;
            }
        }
        return true;
    }
}