lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java

   1 package org.apache.lucene.analysis.fr;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Set;
  22 import java.util.Arrays;
  23 import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
  24 import org.apache.lucene.analysis.CharArraySet;
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.TokenFilter;
  27 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  28 import org.apache.lucene.util.Version;
  29
  30 /**
  31  * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
  32  * tokenized as "avion" (plane).
  33  *
  34  * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
  35  */
  36 public final class ElisionFilter extends TokenFilter {
  37   private CharArraySet articles = CharArraySet.EMPTY_SET;
  38   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  39   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
  40       new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
  41           "l", "m", "t", "qu", "n", "s", "j"), true));
  42
  43   private static char[] apostrophes = {'\'', '\u2019'};
  44
  45   /**
  46    * Set the stopword articles
  47    * @param matchVersion the lucene backwards compatibility version
  48    * @param articles a set of articles
  49    * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
  50    */
  51   @Deprecated
  52   public void setArticles(Version matchVersion, Set<?> articles) {
  53     this.articles = CharArraySet.unmodifiableSet(
  54         CharArraySet.copy(matchVersion, articles));
  55   }
  56
  57   /**
  58    * Set the stopword articles
  59    * @param articles a set of articles
  60    * @deprecated use {@link #setArticles(Version, Set)} instead
  61    */
  62   @Deprecated
  63   public void setArticles(Set<?> articles) {
  64     setArticles(Version.LUCENE_CURRENT, articles);
  65   }
  66   /**
  67    * Constructs an elision filter with standard stop words
  68    */
  69   public ElisionFilter(Version matchVersion, TokenStream input) {
  70     this(matchVersion, input, DEFAULT_ARTICLES);
  71   }
  72
  73   /**
  74    * Constructs an elision filter with standard stop words
  75    * @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
  76    */
  77   @Deprecated
  78   public ElisionFilter(TokenStream input) {
  79     this(Version.LUCENE_30, input);
  80   }
  81
  82   /**
  83    * Constructs an elision filter with a Set of stop words
  84    * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
  85    */
  86   @Deprecated
  87   public ElisionFilter(TokenStream input, Set<?> articles) {
  88     this(Version.LUCENE_30, input, articles);
  89   }
  90
  91   /**
  92    * Constructs an elision filter with a Set of stop words
  93    * @param matchVersion the lucene backwards compatibility version
  94    * @param input the source {@link TokenStream}
  95    * @param articles a set of stopword articles
  96    */
  97   public ElisionFilter(Version matchVersion, TokenStream input, Set<?> articles) {
  98     super(input);
  99     this.articles = CharArraySet.unmodifiableSet(
 100         new CharArraySet(matchVersion, articles, true));
 101   }
 102
 103   /**
 104    * Constructs an elision filter with an array of stop words
 105    * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
 106    */
 107   @Deprecated
 108   public ElisionFilter(TokenStream input, String[] articles) {
 109     this(Version.LUCENE_CURRENT, input,
 110         new CharArraySet(Version.LUCENE_CURRENT,
 111             Arrays.asList(articles), true));
 112   }
 113
 114   /**
 115    * Increments the {@link TokenStream} with a {@link CharTermAttribute} without elisioned start
 116    */
 117   @Override
 118   public final boolean incrementToken() throws IOException {
 119     if (input.incrementToken()) {
 120       char[] termBuffer = termAtt.buffer();
 121       int termLength = termAtt.length();
 122
 123       int minPoz = Integer.MAX_VALUE;
 124       for (int i = 0; i < apostrophes.length; i++) {
 125         char apos = apostrophes[i];
 126         // The equivalent of String.indexOf(ch)
 127         for (int poz = 0; poz < termLength ; poz++) {
 128           if (termBuffer[poz] == apos) {
 129             minPoz = Math.min(poz, minPoz);
 130             break;
 131           }
 132         }
 133       }
 134
 135       // An apostrophe has been found. If the prefix is an article strip it off.
 136       if (minPoz != Integer.MAX_VALUE
 137           && articles.contains(termAtt.buffer(), 0, minPoz)) {
 138         termAtt.copyBuffer(termAtt.buffer(), minPoz + 1, termAtt.length() - (minPoz + 1));
 139       }
 140
 141       return true;
 142     } else {
 143       return false;
 144     }
 145   }
 146 }