add --shared
[pylucene.git] / lucene-java-3.4.0 / lucene / contrib / analyzers / common / src / java / org / apache / lucene / analysis / fr / FrenchStemmer.java
1 package org.apache.lucene.analysis.fr;
2
3 /**
4  * Licensed to the Apache Software Foundation (ASF) under one or more
5  * contributor license agreements.  See the NOTICE file distributed with
6  * this work for additional information regarding copyright ownership.
7  * The ASF licenses this file to You under the Apache License, Version 2.0
8  * (the "License"); you may not use this file except in compliance with
9  * the License.  You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19
/**
 * A stemmer for French words.
 * <p>
 * The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details
 * </p>
 * <p>
 * An instance keeps its working buffer and the derived regions in fields that are
 * rewritten on every call to {@link #stem(String)}, so a single instance must not be
 * used by several threads at the same time.
 * </p>
 * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 5.0
 */
@Deprecated
public class FrenchStemmer {

    /**
     * Locale used for lower-casing. A fixed locale keeps the result independent of the
     * JVM default locale (a Turkish default would otherwise turn 'I' into a dotless i).
     */
    private static final java.util.Locale LOCALE = java.util.Locale.FRENCH;

    // Suffix tables. Within each table the order is significant: the first entry that
    // matches wins, so longer forms are listed before their shorter variants.
    private static final String[] STEP1_STANDARD = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
    private static final String[] LOGIE = { "logies", "logie" };
    private static final String[] USION = { "usions", "utions", "usion", "ution" };
    private static final String[] ENCE = { "ences", "ence" };
    private static final String[] ATEUR = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" };
    private static final String[] EMENT = { "ements", "ement" };
    private static final String[] ISSEMENT = { "issements", "issement" };
    private static final String[] ITE = { "ités", "ité" };
    private static final String[] IF_IVE = { "ifs", "ives", "if", "ive" };
    private static final String[] EAUX = { "eaux" };
    private static final String[] AUX = { "aux" };
    private static final String[] EUSE = { "euses", "euse" };
    private static final String[] EUX = { "eux" };
    private static final String[] AMMENT = { "amment" };
    private static final String[] EMMENT = { "emment" };
    private static final String[] MENT = { "ments", "ment" };
    private static final String[] STEP2A = {
        "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
        "irent", "iriez", "irez", "irions", "irons", "iront",
        "issaIent", "issais", "issantes", "issante", "issants", "issant",
        "issait", "issions", "issons", "issiez", "issez", "issent",
        "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
    private static final String[] STEP2B_PLAIN = {
        "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
        "erons", "eront", "erez", "èrent", "era", "ées", "iez",
        "ée", "és", "er", "ez", "é" };
    private static final String[] STEP2B_AFTER_E = {
        "assions", "assiez", "assent", "asses", "asse", "aIent",
        "antes", "Aient", "ante", "âmes", "âtes", "ants", "ant",
        "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
    private static final String[] IONS = { "ions" };
    private static final String[] ION = { "ion" };
    private static final String[] IER = { "Ière", "ière", "Ier", "ier" };
    private static final String[] FINAL_E = { "e" };
    private static final String[] E_DIAERESIS = { "ë" };

    /**
     * Buffer for the term while it is being stemmed.
     */
    private final StringBuilder sb = new StringBuilder();

    /**
     * Region R0 is equal to the whole buffer
     */
    private String R0;

    /**
     * Region RV
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."
     */
    private String RV;

    /**
     * Region R1
     * "R1 is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R1;

    /**
     * Region R2
     * "R2 is the region after the first non-vowel in R1 following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"
     */
    private String R2;

    /**
     * Set to true if we need to perform step 2
     */
    private boolean suite;

    /**
     * Set to true if the buffer was modified
     */
    private boolean modified;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     * <p>
     * Terms rejected by {@link #isStemmable(String)} are returned unchanged. The
     * returned stem may still contain the upper-case marker letters (U, I, Y) set by
     * {@link #treatVowels(StringBuilder)}: only a trailing Y is mapped back, in step 3
     * (for example <tt>quand</tt> yields <tt>qUand</tt>).
     * NOTE(review): the published algorithm lower-cases the remaining markers as its
     * last step; confirm that callers lower-case the result before changing this.
     * </p>
     *
     * @param term  java.lang.String The term that should be stemmed
     * @return java.lang.String  Discriminator for <tt>term</tt>
     */
    protected String stem( String term ) {
        if ( !isStemmable( term ) ) {
            return term;
        }

        // Load the lower-cased term into the (reused) working buffer.
        sb.setLength( 0 );
        sb.append( term.toLowerCase( LOCALE ) );

        // reset the booleans
        modified = false;
        suite = false;

        treatVowels( sb );
        setStrings();

        step1();

        // Step 2 runs when step 1 left the word alone, or when it only handled one of
        // the amment / emment / ment(s) suffixes.
        if ( ( !modified || suite ) && RV != null ) {
            suite = step2a();
            if ( !suite ) {
                step2b();
            }
        }

        if ( modified || suite ) {
            step3();
        } else {
            step4();
        }

        step5();
        step6();

        return sb.toString();
    }

    /**
     * Sets the search region Strings<br>
     * it needs to be done each time the buffer was modified
     */
    private void setStrings() {
        R0 = sb.toString();
        RV = retrieveRV( sb );
        R1 = retrieveR( sb );
        R2 = ( R1 != null ) ? retrieveR( R1 ) : null;
    }

    /**
     * Removes or replaces the last <tt>count</tt> characters of the buffer, flags the
     * buffer as modified and recomputes the regions.
     *
     * @param count number of trailing characters to drop
     * @param replacement text appended in their place; empty for a plain deletion
     */
    private void replaceTail( int count, String replacement ) {
        int end = sb.length();
        sb.replace( end - count, end, replacement );
        modified = true;
        setStrings();
    }

    /**
     * First step of the Porter Algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step1() {
        // Every call reads the region fields afresh, so each rule sees the regions as
        // left by the rules before it.
        deleteFrom( R2, STEP1_STANDARD );

        replaceFrom( R2, LOGIE, "log" );
        replaceFrom( R2, USION, "u" );
        replaceFrom( R2, ENCE, "ent" );

        deleteButSuffixFromElseReplace( R2, ATEUR, "ic", true, R0, "iqU" );

        deleteButSuffixFromElseReplace( R2, EMENT, "eus", false, R0, "eux" );
        deleteButSuffixFrom( R2, EMENT, "ativ", false );
        deleteButSuffixFrom( R2, EMENT, "iv", false );
        deleteButSuffixFrom( R2, EMENT, "abl", false );
        deleteButSuffixFrom( R2, EMENT, "iqU", false );

        deleteFromIfTestVowelBeforeIn( R1, ISSEMENT, false, R0 );
        deleteFrom( RV, EMENT );

        deleteButSuffixFromElseReplace( R2, ITE, "abil", false, R0, "abl" );
        deleteButSuffixFromElseReplace( R2, ITE, "ic", false, R0, "iqU" );
        deleteButSuffixFrom( R2, ITE, "iv", true );

        deleteButSuffixFromElseReplace( R2, IF_IVE, "icat", false, R0, "iqU" );
        deleteButSuffixFromElseReplace( R2, IF_IVE, "at", true, R2, "iqU" );

        replaceFrom( R0, EAUX, "eau" );

        replaceFrom( R1, AUX, "al" );

        deleteButSuffixFromElseReplace( R2, EUSE, "", true, R1, "eux" );

        deleteFrom( R2, EUX );

        // if one of the next steps is performed, we will need to perform step2a
        if ( replaceFrom( RV, AMMENT, "ant" ) ) {
            suite = true;
        }
        if ( replaceFrom( RV, EMMENT, "ent" ) ) {
            suite = true;
        }
        if ( deleteFromIfTestVowelBeforeIn( RV, MENT, true, RV ) ) {
            suite = true;
        }
    }

    /**
     * Second step (A) of the Porter Algorithm<br>
     * Will be performed if nothing changed from the first step
     * or changed were done in the amment, emment, ments or ment suffixes<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     *
     * @return boolean - true if something changed in the StringBuilder
     */
    private boolean step2a() {
        return deleteFromIfTestVowelBeforeIn( RV, STEP2A, false, RV );
    }

    /**
     * Second step (B) of the Porter Algorithm<br>
     * Will be performed if step 2 A was performed unsuccessfully<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step2b() {
        deleteFrom( RV, STEP2B_PLAIN );
        deleteButSuffixFrom( RV, STEP2B_AFTER_E, "e", true );
        deleteFrom( R2, IONS );
    }

    /**
     * Third step of the Porter Algorithm<br>
     * Replaces a final Y by i and a final ç by c.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step3() {
        int last = sb.length() - 1;
        if ( last < 0 ) {
            return;
        }
        char ch = sb.charAt( last );
        if ( ch == 'Y' ) {
            sb.setCharAt( last, 'i' );
            setStrings();
        } else if ( ch == 'ç' ) {
            sb.setCharAt( last, 'c' );
            setStrings();
        }
    }

    /**
     * Fourth step of the Porter Algorithm<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step4() {
        int len = sb.length();
        if ( len > 1 && sb.charAt( len - 1 ) == 's' ) {
            // A final s is dropped unless it follows one of a, i, o, u, è or s.
            char b = sb.charAt( len - 2 );
            if ( b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's' ) {
                sb.setLength( len - 1 );
                setStrings();
            }
        }

        // "ion" in R2 is removed when RV shows it preceded by s, otherwise by t.
        if ( !deleteFromIfPrecededIn( R2, ION, RV, "s" ) ) {
            deleteFromIfPrecededIn( R2, ION, RV, "t" );
        }

        replaceFrom( RV, IER, "i" );
        deleteFrom( RV, FINAL_E );
        deleteFromIfPrecededIn( RV, E_DIAERESIS, R0, "gu" );
    }

    /**
     * Fifth step of the Porter Algorithm<br>
     * Drops the last letter of a word ending in enn, onn, ett, ell or eill.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step5() {
        if ( R0 == null ) {
            return;
        }
        if ( R0.endsWith( "enn" ) || R0.endsWith( "onn" ) || R0.endsWith( "ett" )
                || R0.endsWith( "ell" ) || R0.endsWith( "eill" ) ) {
            sb.setLength( sb.length() - 1 );
            setStrings();
        }
    }

    /**
     * Sixth (and last!) step of the Porter Algorithm<br>
     * If the word ends with at least one non-vowel and the letter right before that
     * run is é or è, that letter is replaced by e.<br>
     * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
     */
    private void step6() {
        if ( R0 == null ) {
            return;
        }
        // Walk back over the trailing non-vowels to the last vowel of the word.
        int end = R0.length() - 1;
        int i = end;
        while ( i >= 0 && !isVowel( R0.charAt( i ) ) ) {
            i--;
        }
        boolean seenConson = i < end;
        if ( i >= 0 && seenConson ) {
            char ch = R0.charAt( i );
            if ( ch == 'é' || ch == 'è' ) {
                sb.setCharAt( i, 'e' );
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string.
     * Unlike the other deletion helpers this one does not set the <tt>modified</tt> flag.
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param from java.lang.String - the secondary source zone for search
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @return boolean - true if modified
     */
    private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
        if ( source == null || from == null ) {
            return false;
        }
        for ( String suffix : search ) {
            if ( source.endsWith( suffix ) && from.endsWith( prefix + suffix ) ) {
                sb.setLength( sb.length() - suffix.length() );
                setStrings();
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param vowel boolean - true if we need a vowel before the search string
     * @param from java.lang.String - the secondary source zone for search (where vowel could be)
     * @return boolean - true if modified
     */
    private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
        if ( source == null || from == null ) {
            return false;
        }
        for ( String suffix : search ) {
            int n = suffix.length();
            // The zone "from" must be long enough to hold the letter before the suffix.
            if ( source.endsWith( suffix ) && n + 1 <= from.length() ) {
                boolean precededByVowel = isVowel( sb.charAt( sb.length() - ( n + 1 ) ) );
                if ( precededByVowel == vowel ) {
                    replaceTail( n, "" );
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by the prefix
     * (the prefix is deleted as well).
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     */
    private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
        if ( source == null ) {
            return;
        }
        for ( String suffix : search ) {
            String withPrefix = prefix + suffix;
            if ( source.endsWith( withPrefix ) ) {
                replaceTail( withPrefix.length(), "" );
                return;
            }
            if ( without && source.endsWith( suffix ) ) {
                replaceTail( suffix.length(), "" );
                return;
            }
        }
    }

    /**
     * Delete a suffix searched in zone "source" if preceded by prefix<br>
     * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
     * or delete the suffix if specified
     *
     * @param source java.lang.String - the primary source zone for search
     * @param search java.lang.String[] - the strings to search for suppression
     * @param prefix java.lang.String - the prefix to add to the search string to test
     * @param without boolean - true if it will be deleted even without prefix found
     * @param from java.lang.String - the secondary zone in which prefix + suffix triggers the replacement
     * @param replace java.lang.String - the text that replaces prefix + suffix in that case
     */
    private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
        if ( source == null ) {
            return;
        }
        for ( String suffix : search ) {
            String withPrefix = prefix + suffix;
            if ( source.endsWith( withPrefix ) ) {
                replaceTail( withPrefix.length(), "" );
                return;
            }
            if ( from != null && from.endsWith( withPrefix ) ) {
                replaceTail( withPrefix.length(), replace );
                return;
            }
            if ( without && source.endsWith( suffix ) ) {
                replaceTail( suffix.length(), "" );
                return;
            }
        }
    }

    /**
     * Replace a search string with another within the source zone
     *
     * @param source java.lang.String - the source zone for search
     * @param search java.lang.String[] - the strings to search for replacement
     * @param replace java.lang.String - the replacement string
     * @return boolean - true if a replacement was made
     */
    private boolean replaceFrom( String source, String[] search, String replace ) {
        if ( source == null ) {
            return false;
        }
        for ( String suffix : search ) {
            if ( source.endsWith( suffix ) ) {
                replaceTail( suffix.length(), replace );
                return true;
            }
        }
        return false;
    }

    /**
     * Delete a search string within the source zone
     *
     * @param source the source zone for search
     * @param suffix the strings to search for suppression
     */
    private void deleteFrom( String source, String[] suffix ) {
        if ( source == null ) {
            return;
        }
        for ( String candidate : suffix ) {
            if ( source.endsWith( candidate ) ) {
                replaceTail( candidate.length(), "" );
                return;
            }
        }
    }

    /**
     * Test if a char is a french vowel, including accentuated ones.
     * Only lower-case letters are recognised, so the upper-case markers U, I and Y
     * count as non-vowels.
     *
     * @param ch the char to test
     * @return boolean - true if the char is a vowel
     */
    private static boolean isVowel( char ch ) {
        switch ( ch ) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
            case 'â':
            case 'à':
            case 'ë':
            case 'é':
            case 'ê':
            case 'è':
            case 'ï':
            case 'î':
            case 'ô':
            case 'ü':
            case 'ù':
            case 'û':
                return true;
            default:
                return false;
        }
    }

    /**
     * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
     * "R is the region after the first non-vowel following a vowel
     * or is the null region at the end of the word if there is no such non-vowel"<br>
     * @param buffer java.lang.CharSequence - the in buffer
     * @return java.lang.String - the resulting string, or null when the region is empty
     */
    private static String retrieveR( CharSequence buffer ) {
        int len = buffer.length();
        int i = 0;
        // Skip to the first vowel ...
        while ( i < len && !isVowel( buffer.charAt( i ) ) ) {
            i++;
        }
        // ... then to the first non-vowel at or after it.
        while ( i < len && isVowel( buffer.charAt( i ) ) ) {
            i++;
        }
        if ( i + 1 < len ) {
            return buffer.subSequence( i + 1, len ).toString();
        }
        return null;
    }

    /**
     * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
     * "If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found."<br>
     * Words of three letters or fewer have no RV zone here.
     * @param buffer java.lang.CharSequence - the in buffer
     * @return java.lang.String - the resulting string, or null when there is no RV zone
     */
    private static String retrieveRV( CharSequence buffer ) {
        int len = buffer.length();
        if ( len <= 3 ) {
            return null;
        }
        if ( isVowel( buffer.charAt( 0 ) ) && isVowel( buffer.charAt( 1 ) ) ) {
            return buffer.subSequence( 3, len ).toString();
        }
        // NOTE: when no vowel follows the first letter, pos stays 0 and the zone starts
        // after the first letter rather than being empty as in the quoted rule. This is
        // kept as is so that existing stems do not change.
        int pos = 0;
        for ( int c = 1; c < len; c++ ) {
            if ( isVowel( buffer.charAt( c ) ) ) {
                pos = c;
                break;
            }
        }
        if ( pos + 1 < len ) {
            return buffer.subSequence( pos + 1, len ).toString();
        }
        return null;
    }

    /**
     * Turns u and i preceded AND followed by a vowel to UpperCase<br>
     * Turns y preceded OR followed by a vowel to UpperCase<br>
     * Turns u preceded by q to UpperCase<br>
     * The buffer is processed in place from left to right, so a letter that was just
     * upper-cased no longer counts as a vowel for its right-hand neighbour.
     *
     * @param buffer java.lang.StringBuilder - the buffer to treat
     * @return java.lang.StringBuilder - the treated buffer (the same instance)
     */
    private static StringBuilder treatVowels( StringBuilder buffer ) {
        int last = buffer.length() - 1;
        for ( int c = 0; c <= last; c++ ) {
            boolean vowelBefore = c > 0 && isVowel( buffer.charAt( c - 1 ) );
            boolean vowelAfter = c < last && isVowel( buffer.charAt( c + 1 ) );
            switch ( buffer.charAt( c ) ) {
                case 'u':
                    if ( c > 0 && ( buffer.charAt( c - 1 ) == 'q' || ( vowelBefore && vowelAfter ) ) ) {
                        buffer.setCharAt( c, 'U' );
                    }
                    break;
                case 'i':
                    if ( vowelBefore && vowelAfter ) {
                        buffer.setCharAt( c, 'I' );
                    }
                    break;
                case 'y':
                    if ( vowelBefore || vowelAfter ) {
                        buffer.setCharAt( c, 'Y' );
                    }
                    break;
                default:
                    break;
            }
        }
        return buffer;
    }

    /**
     * Checks a term if it can be processed correctly.
     *
     * @return boolean - true if, and only if, the given term consists only of letters
     *         and has no upper-case letter after its first character.
     */
    private static boolean isStemmable( String term ) {
        for ( int c = 0; c < term.length(); c++ ) {
            char ch = term.charAt( c );
            // Discard terms that contain non-letter characters.
            if ( !Character.isLetter( ch ) ) {
                return false;
            }
            // Discard terms with an upper-case letter anywhere but at the start; this
            // also rules out terms with several upper-case letters.
            if ( c > 0 && Character.isUpperCase( ch ) ) {
                return false;
            }
        }
        return true;
    }
}