pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / analyzers / common / src / java / org / apache / lucene / analysis / fr / FrenchStemmer.java
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java

new file mode 100644 (file)

index 0000000..a42382f
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
@@ -0,0 +1,712 @@
+package org.apache.lucene.analysis.fr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A stemmer for French words. 
+ * <p>
+ * The algorithm is based on the work of
+ * Dr Martin Porter on his snowball project<br>
+ * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
+ * (French stemming algorithm) for details
+ * </p>
+ * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead, 
+ * which has the same functionality. This stemmer will be removed in Lucene 5.0
+ */
+@Deprecated
+public class FrenchStemmer {
+
+    /**
+     * Buffer for the terms while stemming them.
+     */
+    private StringBuilder sb = new StringBuilder();
+
+    /**
+     * A temporary buffer, used to reconstruct R2
+     */
+     private StringBuilder tb = new StringBuilder();
+
+       /**
+        * Region R0 is equal to the whole buffer
+        */
+       private String R0;
+
+       /**
+        * Region RV
+        * "If the word begins with two vowels, RV is the region after the third letter,
+        * otherwise the region after the first vowel not at the beginning of the word,
+        * or the end of the word if these positions cannot be found."
+        */
+    private String RV;
+
+       /**
+        * Region R1
+        * "R1 is the region after the first non-vowel following a vowel
+        * or is the null region at the end of the word if there is no such non-vowel"
+        */
+    private String R1;
+
+       /**
+        * Region R2
+        * "R2 is the region after the first non-vowel in R1 following a vowel
+        * or is the null region at the end of the word if there is no such non-vowel"
+        */
+    private String R2;
+
+
+       /**
+        * Set to true if we need to perform step 2
+        */
+    private boolean suite;
+
+       /**
+        * Set to true if the buffer was modified
+        */
+    private boolean modified;
+
+
+    /**
+     * Stems the given term to a unique <tt>discriminator</tt>.
+     * Terms rejected by <tt>isStemmable</tt> are returned unchanged.
+     * Lower-casing uses an explicit French locale, so the result does not
+     * depend on the default locale of the JVM (e.g. Turkish dotless-i rules).
+     *
+     * @param term the term that should be stemmed
+     * @return the discriminator (stem) for <tt>term</tt>
+     */
+    protected String stem( String term ) {
+               if ( !isStemmable( term ) ) {
+                       return term;
+               }
+
+               // Use lowercase for medium stemming; the fixed locale keeps it deterministic.
+               term = term.toLowerCase( java.util.Locale.FRENCH );
+
+               // Reset the StringBuilder and load the term into it.
+               sb.setLength( 0 );
+               sb.append( term );
+
+               // reset the booleans
+               modified = false;
+               suite = false;
+
+               // Upper-case the u/i/y flagged by treatVowels before the regions are computed.
+               sb = treatVowels( sb );
+               setStrings();
+
+               step1();
+
+               // Step 2 runs when step 1 changed nothing, or when step 1 requested it via suite.
+               if ( ( !modified || suite ) && RV != null ) {
+                       suite = step2a();
+                       if ( !suite ) {
+                               step2b();
+                       }
+               }
+
+               if ( modified || suite ) {
+                       step3();
+               } else {
+                       step4();
+               }
+
+               step5();
+               step6();
+               return sb.toString();
+    }
+
+       /**
+        * Recomputes the region strings R0, RV, R1 and R2 from the working buffer<br>
+        * it needs to be done each time the buffer was modified
+        */
+       private void setStrings() {
+               R0 = sb.toString();
+               RV = retrieveRV( sb );
+               R1 = retrieveR( sb );
+               if ( R1 == null ) {
+                       // without R1 there is no R2 either
+                       R2 = null;
+                       return;
+               }
+               // R2 is obtained by applying the R rule to R1, with tb as scratch buffer
+               tb.setLength( 0 );
+               tb.append( R1 );
+               R2 = retrieveR( tb );
+       }
+
+       /**
+        * First step of the Porter Algorithm<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        */
+       private void step1( ) {
+               String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
+               deleteFrom( R2, suffix );
+
+               replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
+               replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
+               replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
+
+               String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
+               deleteButSuffixFromElseReplace( R2, search, "ic",  true, R0, "iqU" );
+
+               deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
+               deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
+               deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
+               deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
+               deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
+
+               deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
+               deleteFrom( RV, new String[] { "ements", "ement" } );
+
+               deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
+               deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
+               deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
+
+               String[] autre = { "ifs", "ives", "if", "ive" };
+               deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
+               deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
+
+               replaceFrom( R0, new String[] { "eaux" }, "eau" );
+
+               replaceFrom( R1, new String[] { "aux" }, "al" );
+
+               deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
+
+               deleteFrom( R2, new String[] { "eux" } );
+
+               // if one of the next steps is performed, we will need to perform step2a
+               boolean temp = false;
+               temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
+               if (temp == true)
+                       suite = true;
+               temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
+               if (temp == true)
+                       suite = true;
+               temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
+               if (temp == true)
+                       suite = true;
+
+       }
+
+       /**
+        * Second step (A) of the Porter Algorithm<br>
+        * Will be performed if nothing changed from the first step
+        * or changed were done in the amment, emment, ments or ment suffixes<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        *
+        * @return boolean - true if something changed in the StringBuilder
+        */
+       private boolean step2a() {
+               String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
+                                                       "irent", "iriez", "irez", "irions", "irons", "iront",
+                                                       "issaIent", "issais", "issantes", "issante", "issants", "issant",
+                                                       "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
+                                                       "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
+               return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
+       }
+
+       /**
+        * Second step (B) of the Porter Algorithm<br>
+        * Will be performed if step 2 A was performed unsuccessfully<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        */
+       private void step2b() {
+               String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
+                                                       "erons", "eront","erez", "èrent", "era", "ées", "iez",
+                                                       "ée", "és", "er", "ez", "é" };
+               deleteFrom( RV, suffix );
+
+               String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
+                                                       "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
+                                                       "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
+               deleteButSuffixFrom( RV, search, "e", true );
+
+               deleteFrom( R2, new String[] { "ions" } );
+       }
+
+       /**
+        * Third step of the Porter Algorithm<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        */
+       private void step3() {
+               if (sb.length()>0)
+               {
+                       char ch = sb.charAt( sb.length()-1 );
+                       if (ch == 'Y')
+                       {
+                               sb.setCharAt( sb.length()-1, 'i' );
+                               setStrings();
+                       }
+                       else if (ch == 'ç')
+                       {
+                               sb.setCharAt( sb.length()-1, 'c' );
+                               setStrings();
+                       }
+               }
+       }
+
+       /**
+        * Fourth step of the Porter Algorithm<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        */
+       private void step4() {
+               if (sb.length() > 1)
+               {
+                       char ch = sb.charAt( sb.length()-1 );
+                       if (ch == 's')
+                       {
+                               char b = sb.charAt( sb.length()-2 );
+                               if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
+                               {
+                                       sb.delete( sb.length() - 1, sb.length());
+                                       setStrings();
+                               }
+                       }
+               }
+               boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
+               if (!found)
+               found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
+
+               replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
+               deleteFrom( RV, new String[] { "e" } );
+               deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
+       }
+
+       /**
+        * Fifth step of the Porter Algorithm<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        */
+       private void step5() {
+               if (R0 != null)
+               {
+                       if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
+                       {
+                               sb.delete( sb.length() - 1, sb.length() );
+                               setStrings();
+                       }
+               }
+       }
+
+       /**
+        * Sixth (and last!) step of the Porter Algorithm<br>
+        * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
+        */
+       private void step6() {
+               if (R0!=null && R0.length()>0)
+               {
+                       boolean seenVowel = false;
+                       boolean seenConson = false;
+                       int pos = -1;
+                       for (int i = R0.length()-1; i > -1; i--)
+                       {
+                               char ch = R0.charAt(i);
+                               if (isVowel(ch))
+                               {
+                                       if (!seenVowel)
+                                       {
+                                               if (ch == 'é' || ch == 'è')
+                                               {
+                                                       pos = i;
+                                                       break;
+                                               }
+                                       }
+                                       seenVowel = true;
+                               }
+                               else
+                               {
+                                       if (seenVowel)
+                                               break;
+                                       else
+                                               seenConson = true;
+                               }
+                       }
+                       if (pos > -1 && seenConson && !seenVowel)
+                               sb.setCharAt(pos, 'e');
+               }
+       }
+
+       /**
+        * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
+        *
+        * @param source java.lang.String - the primary source zone for search
+        * @param search java.lang.String[] - the strings to search for suppression
+        * @param from java.lang.String - the secondary source zone for search
+        * @param prefix java.lang.String - the prefix to add to the search string to test
+        * @return boolean - true if modified
+        */
+       private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
+               boolean found = false;
+               if (source!=null )
+               {
+                       for (int i = 0; i < search.length; i++) {
+                               if ( source.endsWith( search[i] ))
+                               {
+                                       if (from!=null && from.endsWith( prefix + search[i] ))
+                                       {
+                                               sb.delete( sb.length() - search[i].length(), sb.length());
+                                               found = true;
+                                               setStrings();
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               return found;
+       }
+
+       /**
+        * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
+        *
+        * @param source java.lang.String - the primary source zone for search
+        * @param search java.lang.String[] - the strings to search for suppression
+        * @param vowel boolean - true if we need a vowel before the search string
+        * @param from java.lang.String - the secondary source zone for search (where vowel could be)
+        * @return boolean - true if modified
+        */
+       private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
+               boolean found = false;
+               if (source!=null && from!=null)
+               {
+                       for (int i = 0; i < search.length; i++) {
+                               if ( source.endsWith( search[i] ))
+                               {
+                                       if ((search[i].length() + 1) <= from.length())
+                                       {
+                                               boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
+                                               if (test == vowel)
+                                               {
+                                                       sb.delete( sb.length() - search[i].length(), sb.length());
+                                                       modified = true;
+                                                       found = true;
+                                                       setStrings();
+                                                       break;
+                                               }
+                                       }
+                               }
+                       }
+               }
+               return found;
+       }
+
+       /**
+        * Delete a suffix searched in zone "source" if preceded by the prefix
+        *
+        * @param source java.lang.String - the primary source zone for search
+        * @param search java.lang.String[] - the strings to search for suppression
+        * @param prefix java.lang.String - the prefix to add to the search string to test
+        * @param without boolean - true if it will be deleted even without prefix found
+        */
+       private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
+               if (source!=null)
+               {
+                       for (int i = 0; i < search.length; i++) {
+                               if ( source.endsWith( prefix + search[i] ))
+                               {
+                                       sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
+                                       modified = true;
+                                       setStrings();
+                                       break;
+                               }
+                               else if ( without && source.endsWith( search[i] ))
+                               {
+                                       sb.delete( sb.length() - search[i].length(), sb.length() );
+                                       modified = true;
+                                       setStrings();
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Delete a suffix searched in zone "source" if preceded by prefix<br>
+        * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
+        * or delete the suffix if specified
+        *
+        * @param source java.lang.String - the primary source zone for search
+        * @param search java.lang.String[] - the strings to search for suppression
+        * @param prefix java.lang.String - the prefix to add to the search string to test
+        * @param without boolean - true if it will be deleted even without prefix found
+        */
+       private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
+               if (source!=null)
+               {
+                       for (int i = 0; i < search.length; i++) {
+                               if ( source.endsWith( prefix + search[i] ))
+                               {
+                                       sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
+                                       modified = true;
+                                       setStrings();
+                                       break;
+                               }
+                               else if ( from!=null && from.endsWith( prefix + search[i] ))
+                               {
+                                       sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
+                                       modified = true;
+                                       setStrings();
+                                       break;
+                               }
+                               else if ( without && source.endsWith( search[i] ))
+                               {
+                                       sb.delete( sb.length() - search[i].length(), sb.length() );
+                                       modified = true;
+                                       setStrings();
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Replace a search string with another within the source zone
+        *
+        * @param source java.lang.String - the source zone for search
+        * @param search java.lang.String[] - the strings to search for replacement
+        * @param replace java.lang.String - the replacement string
+        */
+       private boolean replaceFrom( String source, String[] search, String replace ) {
+               boolean found = false;
+               if (source!=null)
+               {
+                       for (int i = 0; i < search.length; i++) {
+                               if ( source.endsWith( search[i] ))
+                               {
+                                       sb.replace( sb.length() - search[i].length(), sb.length(), replace );
+                                       modified = true;
+                                       found = true;
+                                       setStrings();
+                                       break;
+                               }
+                       }
+               }
+               return found;
+       }
+
+       /**
+        * Delete the first search string that ends the source zone
+        *
+        * @param source the source zone for search
+        * @param suffix the strings to search for suppression
+        */
+       private void deleteFrom(String source, String[] suffix ) {
+               if ( source == null ) {
+                       return;
+               }
+               for ( String candidate : suffix ) {
+                       if ( !source.endsWith( candidate ) ) {
+                               continue;
+                       }
+                       sb.delete( sb.length() - candidate.length(), sb.length() );
+                       modified = true;
+                       setStrings();
+                       return;
+               }
+       }
+
+       /**
+        * Test if a char is a french vowel, including accented ones.
+        * Only lower-case letters are recognised; every other char, upper-case
+        * letters included, yields false.
+        *
+        * @param ch the char to test
+        * @return boolean - true if the char is a vowel
+        */
+       private boolean isVowel(char ch) {
+               // unaccented vowels
+               if ( ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u' || ch == 'y' ) {
+                       return true;
+               }
+               // grave accent
+               if ( ch == 'à' || ch == 'è' || ch == 'ù' ) {
+                       return true;
+               }
+               // acute accent
+               if ( ch == 'é' ) {
+                       return true;
+               }
+               // circumflex
+               if ( ch == 'â' || ch == 'ê' || ch == 'î' || ch == 'ô' || ch == 'û' ) {
+                       return true;
+               }
+               // diaeresis
+               if ( ch == 'ë' || ch == 'ï' || ch == 'ü' ) {
+                       return true;
+               }
+               // anything else is not a vowel
+               return false;
+       }
+
+       /**
+        * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
+        * "R is the region after the first non-vowel following a vowel
+        * or is the null region at the end of the word if there is no such non-vowel"<br>
+        * @param buffer java.lang.StringBuilder - the in buffer
+        * @return java.lang.String - the resulting string
+        */
+       private String retrieveR( StringBuilder buffer ) {
+               int len = buffer.length();
+               int pos = -1;
+               for (int c = 0; c < len; c++) {
+                       if (isVowel( buffer.charAt( c )))
+                       {
+                               pos = c;
+                               break;
+                       }
+               }
+               if (pos > -1)
+               {
+                       int consonne = -1;
+                       for (int c = pos; c < len; c++) {
+                               if (!isVowel(buffer.charAt( c )))
+                               {
+                                       consonne = c;
+                                       break;
+                               }
+                       }
+                       if (consonne > -1 && (consonne+1) < len)
+                               return buffer.substring( consonne+1, len );
+                       else
+                               return null;
+               }
+               else
+                       return null;
+       }
+
+       /**
+        * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
+        * "If the word begins with two vowels, RV is the region after the third letter,
+        * otherwise the region after the first vowel not at the beginning of the word,
+        * or the end of the word if these positions cannot be found."<br>
+        * @param buffer java.lang.StringBuilder - the in buffer
+        * @return java.lang.String - the resulting string
+        */
+       private String retrieveRV( StringBuilder buffer ) {
+               int len = buffer.length();
+               if ( buffer.length() > 3)
+               {
+                       if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
+                               return buffer.substring(3,len);
+                       }
+                       else
+                       {
+                               int pos = 0;
+                               for (int c = 1; c < len; c++) {
+                                       if (isVowel( buffer.charAt( c )))
+                                       {
+                                               pos = c;
+                                               break;
+                                       }
+                               }
+                               if ( pos+1 < len )
+                                       return buffer.substring( pos+1, len );
+                               else
+                                       return null;
+                       }
+               }
+               else
+                       return null;
+       }
+
+
+
+    /**
+        * Turns u and i preceded AND followed by a vowel to UpperCase<br>
+        * Turns y preceded OR followed by a vowel to UpperCase<br>
+        * Turns u preceded by q to UpperCase<br>
+     *
+     * @param buffer java.util.StringBuilder - the buffer to treat
+     * @return java.util.StringBuilder - the treated buffer
+     */
+    private StringBuilder treatVowels( StringBuilder buffer ) {
+               for ( int c = 0; c < buffer.length(); c++ ) {
+                       char ch = buffer.charAt( c );
+
+                       if (c == 0) // first char
+                       {
+                               if (buffer.length()>1)
+                               {
+                                       if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
+                                               buffer.setCharAt( c, 'Y' );
+                               }
+                       }
+                       else if (c == buffer.length()-1) // last char
+                       {
+                               if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
+                                       buffer.setCharAt( c, 'U' );
+                               if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
+                                       buffer.setCharAt( c, 'Y' );
+                       }
+                       else // other cases
+                       {
+                               if (ch == 'u')
+                               {
+                                       if (buffer.charAt( c - 1) == 'q')
+                                               buffer.setCharAt( c, 'U' );
+                                       else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
+                                               buffer.setCharAt( c, 'U' );
+                               }
+                               if (ch == 'i')
+                               {
+                                       if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
+                                               buffer.setCharAt( c, 'I' );
+                               }
+                               if (ch == 'y')
+                               {
+                                       if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
+                                               buffer.setCharAt( c, 'Y' );
+                               }
+                       }
+               }
+
+               return buffer;
+    }
+
+    /**
+     * Checks a term if it can be processed correctly.
+     *
+     * @return boolean - true if, and only if, the given term consists in letters.
+     */
+    private boolean isStemmable( String term ) {
+               boolean upper = false;
+               int first = -1;
+               for ( int c = 0; c < term.length(); c++ ) {
+                       // Discard terms that contain non-letter characters.
+                       if ( !Character.isLetter( term.charAt( c ) ) ) {
+                               return false;
+                       }
+                       // Discard terms that contain multiple uppercase letters.
+                       if ( Character.isUpperCase( term.charAt( c ) ) ) {
+                               if ( upper ) {
+                                       return false;
+                               }
+                       // First encountered uppercase letter, set flag and save
+                       // position.
+                               else {
+                                       first = c;
+                                       upper = true;
+                               }
+                       }
+               }
+               // Discard the term if it contains a single uppercase letter that
+               // is not starting the term.
+               if ( first > 0 ) {
+                       return false;
+               }
+               return true;
+    }
+}