1 package org.apache.lucene.analysis.fr;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 * A stemmer for French words.
23 * The algorithm is based on the work of
24 * Dr Martin Porter on his snowball project<br>
25 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
26 * (French stemming algorithm) for details
28 * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
29 * which has the same functionality. This filter will be removed in Lucene 5.0
32 public class FrenchStemmer {
35 * Buffer for the terms while stemming them.
// Working buffer: stem() clears it, the step methods edit it in place, and stem() returns its contents.
37 private StringBuilder sb = new StringBuilder();
40 * A temporary buffer, used to reconstruct R2
// Cleared in setStrings() and then passed to retrieveR() to obtain R2.
42 private StringBuilder tb = new StringBuilder();
// NOTE(review): the comment fragments below describe regions R0, RV, R1 and R2, but their field
// declarations are not visible in this listing even though the methods read and assign them.
45 * Region R0 is equal to the whole buffer
51 * "If the word begins with two vowels, RV is the region after the third letter,
52 * otherwise the region after the first vowel not at the beginning of the word,
53 * or the end of the word if these positions cannot be found."
59 * "R1 is the region after the first non-vowel following a vowel
60 * or is the null region at the end of the word if there is no such non-vowel"
66 * "R2 is the region after the first non-vowel in R1 following a vowel
67 * or is the null region at the end of the word if there is no such non-vowel"
73 * Set to true if we need to perform step 2
// Tested together with modified in stem() to decide which later steps run.
75 private boolean suite;
78 * Set to true if the buffer was modified
// Tested together with suite in stem().
80 private boolean modified;
84 * Stems the given term to a unique <tt>discriminator</tt>.
86 * @param term java.lang.String The term that should be stemmed
87 * @return java.lang.String Discriminator for <tt>term</tt>
// Entry point of the stemmer: works on the shared buffer sb and returns sb.toString().
// NOTE(review): this listing embeds viewer line numbers and omits some lines (braces and short
// statements), so the comments here describe only the statements that are visible.
89 protected String stem( String term ) {
// Branch taken for terms rejected by isStemmable(); its body is not visible in this listing.
90 if ( !isStemmable( term ) ) {
94 // Use lowercase for medium stemming.
// NOTE(review): toLowerCase() without a Locale argument follows the JVM default locale;
// presumably a fixed locale such as Locale.ROOT is wanted here - TODO confirm.
95 term = term.toLowerCase();
97 // Reset the StringBuilder.
98 sb.delete( 0, sb.length() );
101 // reset the booleans
// treatVowels() upper-cases selected u, i and y characters in the buffer and returns the buffer.
105 sb = treatVowels( sb );
// First test on the modified and suite flags; the statements it guards are not visible here.
111 if (!modified || suite)
// Second test on the same flags; the statements it guards are not visible here.
121 if (modified || suite)
// The result is whatever remains in the working buffer.
130 return sb.toString();
134 * Sets the search region Strings<br>
135 * it needs to be done each time the buffer was modified
// Recomputes the cached region strings from the current buffer contents.
// NOTE(review): lines are missing from this listing between the visible statements (for example
// whatever fills tb before R2 is computed), so only the visible assignments are described.
137 private void setStrings() {
// RV and R1 are both derived directly from sb.
140 RV = retrieveRV( sb );
141 R1 = retrieveR( sb );
// tb is emptied, then R2 is obtained by applying the same R rule to tb.
144 tb.delete( 0, tb.length() );
146 R2 = retrieveR( tb );
153 * First step of the Porter Algorithm<br>
154 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Step 1: suffix removal. Each call tests the listed suffixes against one region string
// (R0, RV, R1 or R2) and edits sb through the helper it calls.
// NOTE(review): this listing omits some lines; in particular nothing is visible after the three
// assignments to temp at the end, so how temp is consumed cannot be seen from here.
156 private void step1( ) {
// Plain deletions when the suffix lies in R2.
157 String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
158 deleteFrom( R2, suffix );
// Replacements when the suffix lies in R2.
160 replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
161 replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
162 replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
// ic + suffix in R2 is deleted; otherwise ic + suffix found in R0 becomes iqU; otherwise the bare
// suffix in R2 is deleted (without == true).
164 String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
165 deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
// ement(s) handling: each call only acts when the given prefix precedes the suffix (without == false).
167 deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
168 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
169 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
170 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
171 deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
// issement(s) in R1 is subject to the vowel test with vowel == false.
173 deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
174 deleteFrom( RV, new String[] { "ements", "ement" } );
// ité(s) handling.
176 deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
177 deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
178 deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
// if/ive handling.
180 String[] autre = { "ifs", "ives", "if", "ive" };
181 deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
// NOTE(review): here from is R2, the same string as source, so the replace branch of the helper
// repeats the test of its first branch and looks unreachable - TODO confirm this is intended.
182 deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
184 replaceFrom( R0, new String[] { "eaux" }, "eau" );
186 replaceFrom( R1, new String[] { "aux" }, "al" );
// With an empty prefix the first branch of the helper already matches the bare suffix in R2.
188 deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
190 deleteFrom( R2, new String[] { "eux" } );
192 // if one of the next steps is performed, we will need to perform step2a
// temp receives the boolean result of each helper call below.
193 boolean temp = false;
194 temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
197 temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
200 temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
207 * Second step (A) of the Porter Algorithm<br>
208 * Will be performed if nothing changed from the first step
209 * or changed were done in the amment, emment, ments or ment suffixes<br>
210 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
212 * @return boolean - true if something changed in the StringBuilder
214 private boolean step2a() {
215 String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
216 "irent", "iriez", "irez", "irions", "irons", "iront",
217 "issaIent", "issais", "issantes", "issante", "issants", "issant",
218 "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
219 "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
220 return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
224 * Second step (B) of the Porter Algorithm<br>
225 * Will be performed if step 2 A was performed unsuccessfully<br>
226 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
228 private void step2b() {
229 String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
230 "erons", "eront","erez", "èrent", "era", "ées", "iez",
231 "ée", "és", "er", "ez", "é" };
232 deleteFrom( RV, suffix );
234 String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
235 "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
236 "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
237 deleteButSuffixFrom( RV, search, "e", true );
239 deleteFrom( R2, new String[] { "ions" } );
243 * Third step of the Porter Algorithm<br>
244 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Step 3: rewrites the final character of sb.
// NOTE(review): the tests on ch that select each replacement, and any guard against an empty
// buffer before charAt, are not visible in this listing.
246 private void step3() {
// ch is the last character currently in the buffer.
249 char ch = sb.charAt( sb.length()-1 );
// One branch replaces the last character with i ...
252 sb.setCharAt( sb.length()-1, 'i' );
// ... another replaces it with c.
257 sb.setCharAt( sb.length()-1, 'c' );
264 * Fourth step of the Porter Algorithm<br>
265 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Step 4: residual suffix handling.
// NOTE(review): some guarding lines are missing from this listing; comments describe visible lines only.
267 private void step4() {
// ch is the last character of the buffer; the test applied to it is not visible here.
270 char ch = sb.charAt( sb.length()-1 );
// b is the character just before the last one.
273 char b = sb.charAt( sb.length()-2 );
// Presumably guards the deletion below: true when b is none of a, i, o, u, è, s.
274 if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
// Removes the final character of the buffer.
276 sb.delete( sb.length() - 1, sb.length());
// ion found in R2 is removed when RV ends with s + ion ...
281 boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
// ... and the same lookup is made with t as the preceding letter.
283 found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
// The ier/ière family found in RV is replaced by i.
285 replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
// A final e found in RV is deleted.
286 deleteFrom( RV, new String[] { "e" } );
// A final ë found in RV is deleted when R0 ends with gu + ë.
287 deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
291 * Fifth step of the Porter Algorithm<br>
292 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Step 5: un-doubling of a final double consonant.
// NOTE(review): any null check on R0 before this test is not visible in this listing.
294 private void step5() {
// True when the whole word ends with enn, onn, ett, ell or eill.
297 if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
// Removes the last character of the buffer (presumably the statement guarded by the test above).
299 sb.delete( sb.length() - 1, sb.length() );
306 * Sixth (and last!) step of the Porter Algorithm<br>
307 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Step 6: un-accenting of an é or è near the end of the word.
// NOTE(review): the declaration of pos and the bodies of the tests inside the loop are not visible
// in this listing; comments on them are hedged accordingly.
309 private void step6() {
// Nothing is done for a null or empty R0.
310 if (R0!=null && R0.length()>0)
312 boolean seenVowel = false;
313 boolean seenConson = false;
// Scans R0 from its last character back to its first.
315 for (int i = R0.length()-1; i > -1; i--)
317 char ch = R0.charAt(i);
// Test for an accented e; presumably this is where pos is recorded - TODO confirm.
322 if (ch == 'é' || ch == 'è')
// Rewrite only when a position was recorded, a consonant was seen and no vowel was seen.
338 if (pos > -1 && seenConson && !seenVowel)
// The character at pos becomes a plain e.
339 sb.setCharAt(pos, 'e');
344 * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
346 * @param source java.lang.String - the primary source zone for search
347 * @param search java.lang.String[] - the strings to search for suppression
348 * @param from java.lang.String - the secondary source zone for search
349 * @param prefix java.lang.String - the prefix to add to the search string to test
350 * @return boolean - true if modified
// Deletes a suffix from sb when it ends source and, in from, is preceded by prefix.
// NOTE(review): this listing omits lines of the body (for example any null check on source, the
// flag updates and the return statement), so only the visible statements are described.
352 private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
353 boolean found = false;
// Candidates are tried in array order.
356 for (int i = 0; i < search.length; i++) {
357 if ( source.endsWith( search[i] ))
// Second condition: from must be non-null and end with prefix followed by the suffix.
359 if (from!=null && from.endsWith( prefix + search[i] ))
// Only the suffix itself is removed from the end of the buffer; the prefix stays.
361 sb.delete( sb.length() - search[i].length(), sb.length());
373 * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
375 * @param source java.lang.String - the primary source zone for search
376 * @param search java.lang.String[] - the strings to search for suppression
377 * @param vowel boolean - true if we need a vowel before the search string
378 * @param from java.lang.String - the secondary source zone for search (where vowel could be)
379 * @return boolean - true if modified
// Deletes a suffix from sb depending on whether the character before it is a vowel.
// NOTE(review): the comparison between test and the vowel parameter, the flag updates and the
// return statement are not visible in this listing.
381 private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
382 boolean found = false;
// Both zones must be present for anything to happen.
383 if (source!=null && from!=null)
// Candidates are tried in array order.
385 for (int i = 0; i < search.length; i++) {
386 if ( source.endsWith( search[i] ))
// from must be long enough to hold the suffix plus one preceding character.
388 if ((search[i].length() + 1) <= from.length())
// test tells whether the buffer character just before the suffix is a vowel.
390 boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
// Removes the suffix from the end of the buffer.
393 sb.delete( sb.length() - search[i].length(), sb.length());
407 * Delete a suffix searched in zone "source" if preceded by the prefix
409 * @param source java.lang.String - the primary source zone for search
410 * @param search java.lang.String[] - the strings to search for suppression
411 * @param prefix java.lang.String - the prefix to add to the search string to test
412 * @param without boolean - true if it will be deleted even without prefix found
// Deletes prefix + suffix from sb when source ends with both, or the suffix alone when
// without is true and source ends with just the suffix.
// NOTE(review): lines of the body are missing from this listing (for example any null check on
// source and whatever follows each deletion), so only the visible statements are described.
414 private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
// Candidates are tried in array order.
417 for (int i = 0; i < search.length; i++) {
418 if ( source.endsWith( prefix + search[i] ))
// Prefix and suffix are removed together.
420 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
// Fallback used only when the caller allows deletion without the prefix.
425 else if ( without && source.endsWith( search[i] ))
427 sb.delete( sb.length() - search[i].length(), sb.length() );
437 * Delete a suffix searched in zone "source" if preceded by prefix<br>
438 * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
439 * or delete the suffix if specified
441 * @param source java.lang.String - the primary source zone for search
442 * @param search java.lang.String[] - the strings to search for suppression
443 * @param prefix java.lang.String - the prefix to add to the search string to test
444 * @param without boolean - true if it will be deleted even without prefix found
// Three-way suffix handling on sb, tested in this order for each candidate:
//   1. source ends with prefix + suffix: both are deleted;
//   2. from ends with prefix + suffix: both are replaced by the replace string;
//   3. without is true and source ends with the suffix: the suffix alone is deleted.
// NOTE(review): lines of the body are missing from this listing (for example any null check on
// source and whatever follows each edit), so only the visible statements are described.
446 private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
// Candidates are tried in array order.
449 for (int i = 0; i < search.length; i++) {
450 if ( source.endsWith( prefix + search[i] ))
452 sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
// Reached only when the first test failed; from may be null and is checked for that.
457 else if ( from!=null && from.endsWith( prefix + search[i] ))
459 sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
464 else if ( without && source.endsWith( search[i] ))
466 sb.delete( sb.length() - search[i].length(), sb.length() );
476 * Replace a search string with another within the source zone
478 * @param source java.lang.String - the source zone for search
479 * @param search java.lang.String[] - the strings to search for replacement
480 * @param replace java.lang.String - the replacement string
// Replaces a suffix at the end of sb with the replace string when source ends with that suffix.
// NOTE(review): any null check on source, the flag updates and the return statement are not
// visible in this listing.
482 private boolean replaceFrom( String source, String[] search, String replace ) {
483 boolean found = false;
// Candidates are tried in array order.
486 for (int i = 0; i < search.length; i++) {
487 if ( source.endsWith( search[i] ))
// The last search[i].length() characters of the buffer are swapped for replace.
489 sb.replace( sb.length() - search[i].length(), sb.length(), replace );
501 * Delete a search string within the source zone
503 * @param source the source zone for search
504 * @param suffix the strings to search for suppression
// Deletes a suffix from the end of sb when source ends with that suffix.
// NOTE(review): any null check on source and whatever follows the deletion are not visible in
// this listing.
506 private void deleteFrom(String source, String[] suffix ) {
// Candidates are tried in array order.
509 for (int i = 0; i < suffix.length; i++) {
510 if (source.endsWith( suffix[i] ))
512 sb.delete( sb.length() - suffix[i].length(), sb.length());
522 * Test if a char is a French vowel, including accented ones
524 * @param ch the char to test
525 * @return boolean - true if the char is a vowel
// Vowel predicate used by the region and vowel-marking helpers.
// NOTE(review): only the signature is visible in this listing; the set of characters treated
// as vowels cannot be confirmed from here.
527 private boolean isVowel(char ch) {
555 * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
556 * "R is the region after the first non-vowel following a vowel
557 * or is the null region at the end of the word if there is no such non-vowel"<br>
558 * @param buffer java.lang.StringBuilder - the in buffer
559 * @return java.lang.String - the resulting string
// Computes an R region of buffer: the text after the first non-vowel that follows a vowel.
// NOTE(review): the declarations and assignments of pos and consonne, and the fallback return
// value, are not visible in this listing.
561 private String retrieveR( StringBuilder buffer ) {
562 int len = buffer.length();
// First scan: looks for a vowel from the start of the buffer.
564 for (int c = 0; c < len; c++) {
565 if (isVowel( buffer.charAt( c )))
// Second scan: starting at pos, looks for a non-vowel.
574 for (int c = pos; c < len; c++) {
575 if (!isVowel(buffer.charAt( c )))
// A region is returned only if a non-vowel index was recorded and something follows it.
581 if (consonne > -1 && (consonne+1) < len)
582 return buffer.substring( consonne+1, len );
591 * Retrieve the "RV zone" from a buffer and return the corresponding string<br>
592 * "If the word begins with two vowels, RV is the region after the third letter,
593 * otherwise the region after the first vowel not at the beginning of the word,
594 * or the end of the word if these positions cannot be found."<br>
595 * @param buffer java.lang.StringBuilder - the in buffer
596 * @return java.lang.String - the resulting string
// Computes the RV region of buffer.
// NOTE(review): the declaration and assignment of pos, and the fallback return value, are not
// visible in this listing.
598 private String retrieveRV( StringBuilder buffer ) {
599 int len = buffer.length();
// Only buffers longer than three characters are examined (same value as len).
600 if ( buffer.length() > 3)
// Word starting with two vowels: RV is everything after the third character.
602 if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
603 return buffer.substring(3,len);
// Otherwise scan for a vowel starting at index 1, i.e. not at the beginning of the word.
608 for (int c = 1; c < len; c++) {
609 if (isVowel( buffer.charAt( c )))
// RV is the text after position pos.
616 return buffer.substring( pos+1, len );
628 * Turns u and i preceded AND followed by a vowel to UpperCase<br>
629 * Turns y preceded OR followed by a vowel to UpperCase<br>
630 * Turns u preceded by q to UpperCase<br>
632 * @param buffer java.lang.StringBuilder - the buffer to treat
633 * @return java.lang.StringBuilder - the treated buffer
// Upper-cases selected u, i and y characters of buffer in place so that later steps can tell
// them apart from ordinary vowels.
// NOTE(review): the tests on ch that introduce the three interior cases, the closing braces and
// the return statement are not visible in this listing.
635 private StringBuilder treatVowels( StringBuilder buffer ) {
636 for ( int c = 0; c < buffer.length(); c++ ) {
637 char ch = buffer.charAt( c );
// First character: only a following character can be inspected.
639 if (c == 0) // first char
641 if (buffer.length()>1)
// y followed by a vowel becomes Y.
643 if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
644 buffer.setCharAt( c, 'Y' );
// Last character: only the preceding character can be inspected.
647 else if (c == buffer.length()-1) // last char
// u preceded by q becomes U.
649 if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
650 buffer.setCharAt( c, 'U' );
// y preceded by a vowel becomes Y.
651 if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
652 buffer.setCharAt( c, 'Y' );
// Interior case writing U (presumably for ch == u): after q, or between two vowels.
658 if (buffer.charAt( c - 1) == 'q')
659 buffer.setCharAt( c, 'U' );
660 else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
661 buffer.setCharAt( c, 'U' );
// Interior case writing I (presumably for ch == i): between two vowels.
665 if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
666 buffer.setCharAt( c, 'I' );
// Interior case writing Y (presumably for ch == y): next to a vowel on either side.
670 if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
671 buffer.setCharAt( c, 'Y' );
680 * Checks a term if it can be processed correctly.
682 * @return boolean - true if, and only if, the given term consists of letters.
684 private boolean isStemmable( String term ) {
685 boolean upper = false;
687 for ( int c = 0; c < term.length(); c++ ) {
688 // Discard terms that contain non-letter characters.
689 if ( !Character.isLetter( term.charAt( c ) ) ) {
692 // Discard terms that contain multiple uppercase letters.
693 if ( Character.isUpperCase( term.charAt( c ) ) ) {
697 // First encountered uppercase letter, set flag and save
705 // Discard the term if it contains a single uppercase letter that
706 // is not starting the term.