1 package org.apache.lucene.analysis.miscellaneous;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
39 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
40 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
41 * (with behaviour identical to {@link String#split(String)}),
42 * and that combines the functionality of
43 * {@link org.apache.lucene.analysis.LetterTokenizer},
44 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
45 * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
46 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
47 * multi-purpose class.
 * If you are unsure what a regular expression should look like, consider
50 * prototyping by simply trying various expressions on some test texts via
51 * {@link String#split(String)}. Once you are satisfied, give that regex to
52 * PatternAnalyzer. Also see <a target="_blank"
53 * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
55 * This class can be considerably faster than the "normal" Lucene tokenizers.
56 * It can also serve as a building block in a compound Lucene
57 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
60 * PatternAnalyzer pat = ...
61 * TokenStream tokenStream = new SnowballFilter(
62 * pat.tokenStream("content", "James is running round in the woods"),
67 public final class PatternAnalyzer extends Analyzer {
69 /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
70 public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
72 /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
73 public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
75 private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
76 CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
78 "a", "about", "above", "across", "adj", "after", "afterwards",
79 "again", "against", "albeit", "all", "almost", "alone", "along",
80 "already", "also", "although", "always", "among", "amongst", "an",
81 "and", "another", "any", "anyhow", "anyone", "anything",
82 "anywhere", "are", "around", "as", "at", "be", "became", "because",
83 "become", "becomes", "becoming", "been", "before", "beforehand",
84 "behind", "being", "below", "beside", "besides", "between",
85 "beyond", "both", "but", "by", "can", "cannot", "co", "could",
86 "down", "during", "each", "eg", "either", "else", "elsewhere",
87 "enough", "etc", "even", "ever", "every", "everyone", "everything",
88 "everywhere", "except", "few", "first", "for", "former",
89 "formerly", "from", "further", "had", "has", "have", "he", "hence",
90 "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
91 "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
92 "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
93 "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
94 "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
95 "must", "my", "myself", "namely", "neither", "never",
96 "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
97 "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
98 "once one", "only", "onto", "or", "other", "others", "otherwise",
99 "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
100 "rather", "s", "same", "seem", "seemed", "seeming", "seems",
101 "several", "she", "should", "since", "so", "some", "somehow",
102 "someone", "something", "sometime", "sometimes", "somewhere",
103 "still", "such", "t", "than", "that", "the", "their", "them",
104 "themselves", "then", "thence", "there", "thereafter", "thereby",
105 "therefor", "therein", "thereupon", "these", "they", "this",
106 "those", "though", "through", "throughout", "thru", "thus", "to",
107 "together", "too", "toward", "towards", "under", "until", "up",
108 "upon", "us", "very", "via", "was", "we", "well", "were", "what",
109 "whatever", "whatsoever", "when", "whence", "whenever",
110 "whensoever", "where", "whereafter", "whereas", "whereat",
111 "whereby", "wherefrom", "wherein", "whereinto", "whereof",
112 "whereon", "whereto", "whereunto", "whereupon", "wherever",
113 "wherewith", "whether", "which", "whichever", "whichsoever",
114 "while", "whilst", "whither", "who", "whoever", "whole", "whom",
115 "whomever", "whomsoever", "whose", "whosoever", "why", "will",
116 "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
117 "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
122 * A lower-casing word analyzer with English stop words (can be shared
123 * freely across threads without harm); global per class loader.
125 public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
126 Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
129 * A lower-casing word analyzer with <b>extended </b> English stop words
130 * (can be shared freely across threads without harm); global per class
131 * loader. The stop words are borrowed from
132 * http://thomas.loc.gov/home/stopwords.html, see
133 * http://thomas.loc.gov/home/all.about.inquery.html
135 public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
136 Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
138 private final Pattern pattern;
139 private final boolean toLowerCase;
140 private final Set<?> stopWords;
142 private final Version matchVersion;
145 * Constructs a new instance with the given parameters.
147 * @param matchVersion If >= {@link Version#LUCENE_29}, StopFilter.enablePositionIncrement is set to true
149 * a regular expression delimiting tokens
151 * if <code>true</code> returns tokens after applying
152 * String.toLowerCase()
154 * if non-null, ignores all tokens that are contained in the
155 * given stop set (after previously having applied toLowerCase()
156 * if applicable). For example, created via
157 * {@link StopFilter#makeStopSet(Version, String[])}and/or
158 * {@link org.apache.lucene.analysis.WordlistLoader}as in
159 * <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
160 * or <a href="http://www.unine.ch/info/clef/">other stop words
163 public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set<?> stopWords) {
165 throw new IllegalArgumentException("pattern must not be null");
167 if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
168 else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
170 if (stopWords != null && stopWords.size() == 0) stopWords = null;
172 this.pattern = pattern;
173 this.toLowerCase = toLowerCase;
174 this.stopWords = stopWords;
175 this.matchVersion = matchVersion;
179 * Creates a token stream that tokenizes the given string into token terms
183 * the name of the field to tokenize (currently ignored).
185 * the string to tokenize
186 * @return a new token stream
188 public TokenStream tokenStream(String fieldName, String text) {
189 // Ideally the Analyzer superclass should have a method with the same signature,
190 // with a default impl that simply delegates to the StringReader flavour.
192 throw new IllegalArgumentException("text must not be null");
195 if (pattern == NON_WORD_PATTERN) { // fast path
196 stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
198 else if (pattern == WHITESPACE_PATTERN) { // fast path
199 stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
202 stream = new PatternTokenizer(text, pattern, toLowerCase);
203 if (stopWords != null) stream = new StopFilter(matchVersion, stream, stopWords);
210 * Creates a token stream that tokenizes all the text in the given Reader;
211 * This implementation forwards to <code>tokenStream(String, String)</code> and is
212 * less efficient than <code>tokenStream(String, String)</code>.
215 * the name of the field to tokenize (currently ignored).
217 * the reader delivering the text
218 * @return a new token stream
221 public TokenStream tokenStream(String fieldName, Reader reader) {
222 if (reader instanceof FastStringReader) { // fast path
223 return tokenStream(fieldName, ((FastStringReader)reader).getString());
227 String text = toString(reader);
228 return tokenStream(fieldName, text);
229 } catch (IOException e) {
230 throw new RuntimeException(e);
235 * Indicates whether some other object is "equal to" this one.
238 * the reference object with which to compare.
239 * @return true if equal, false otherwise
242 public boolean equals(Object other) {
243 if (this == other) return true;
244 if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
245 if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
247 if (other instanceof PatternAnalyzer) {
248 PatternAnalyzer p2 = (PatternAnalyzer) other;
250 toLowerCase == p2.toLowerCase &&
251 eqPattern(pattern, p2.pattern) &&
252 eq(stopWords, p2.stopWords);
258 * Returns a hash code value for the object.
260 * @return the hash code.
263 public int hashCode() {
264 if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
265 if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
268 h = 31*h + pattern.pattern().hashCode();
269 h = 31*h + pattern.flags();
270 h = 31*h + (toLowerCase ? 1231 : 1237);
271 h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
275 /** equality where o1 and/or o2 can be null */
276 private static boolean eq(Object o1, Object o2) {
277 return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
280 /** assumes p1 and p2 are not null */
281 private static boolean eqPattern(Pattern p1, Pattern p2) {
282 return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
286 * Reads until end-of-stream and returns all read chars, finally closes the stream.
288 * @param input the input stream
289 * @throws IOException if an I/O error occurs while reading the stream
291 private static String toString(Reader input) throws IOException {
294 char[] buffer = new char[len];
295 char[] output = new char[len];
299 while ((n = input.read(buffer)) >= 0) {
300 if (len + n > output.length) { // grow capacity
301 char[] tmp = new char[Math.max(output.length << 1, len + n)];
302 System.arraycopy(output, 0, tmp, 0, len);
303 System.arraycopy(buffer, 0, tmp, len, n);
304 buffer = output; // use larger buffer for future larger bulk reads
307 System.arraycopy(buffer, 0, output, len, n);
312 return new String(output, 0, len);
319 ///////////////////////////////////////////////////////////////////////////////
321 ///////////////////////////////////////////////////////////////////////////////
323 * The work horse; performance isn't fantastic, but it's not nearly as bad
324 * as one might think - kudos to the Sun regex developers.
326 private static final class PatternTokenizer extends TokenStream {
328 private final String str;
329 private final boolean toLowerCase;
330 private Matcher matcher;
332 private static final Locale locale = Locale.getDefault();
333 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
334 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
336 public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
338 this.matcher = pattern.matcher(str);
339 this.toLowerCase = toLowerCase;
343 public final boolean incrementToken() {
344 if (matcher == null) return false;
346 while (true) { // loop takes care of leading and trailing boundary cases
349 boolean isMatch = matcher.find();
351 end = matcher.start();
355 matcher = null; // we're finished
358 if (start != end) { // non-empty match (header/trailer)
359 String text = str.substring(start, end);
360 if (toLowerCase) text = text.toLowerCase(locale);
361 termAtt.setEmpty().append(text);
362 offsetAtt.setOffset(start, end);
365 if (!isMatch) return false;
370 public final void end() {
372 final int finalOffset = str.length();
373 this.offsetAtt.setOffset(finalOffset, finalOffset);
378 ///////////////////////////////////////////////////////////////////////////////
380 ///////////////////////////////////////////////////////////////////////////////
382 * Special-case class for best performance in common cases; this class is
383 * otherwise unnecessary.
385 private static final class FastStringTokenizer extends TokenStream {
387 private final String str;
389 private final boolean isLetter;
390 private final boolean toLowerCase;
391 private final Set<?> stopWords;
392 private static final Locale locale = Locale.getDefault();
393 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
394 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
396 public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
398 this.isLetter = isLetter;
399 this.toLowerCase = toLowerCase;
400 this.stopWords = stopWords;
404 public boolean incrementToken() {
406 // cache loop instance vars (performance)
408 int len = s.length();
410 boolean letter = isLetter;
415 // find beginning of token
417 while (i < len && !isTokenChar(s.charAt(i), letter)) {
421 if (i < len) { // found beginning; now find end of token
423 while (i < len && isTokenChar(s.charAt(i), letter)) {
427 text = s.substring(start, i);
428 if (toLowerCase) text = text.toLowerCase(locale);
429 // if (toLowerCase) {
430 //// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
431 //// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
432 // text = s.substring(start, i).toLowerCase();
433 //// char[] chars = new char[i-start];
434 //// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
435 //// text = new String(chars);
437 // text = s.substring(start, i);
440 } while (text != null && isStopWord(text));
447 termAtt.setEmpty().append(text);
448 offsetAtt.setOffset(start, i);
453 public final void end() {
455 final int finalOffset = str.length();
456 this.offsetAtt.setOffset(finalOffset, finalOffset);
459 private boolean isTokenChar(char c, boolean isLetter) {
460 return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
463 private boolean isStopWord(String text) {
464 return stopWords != null && stopWords.contains(text);
470 ///////////////////////////////////////////////////////////////////////////////
472 ///////////////////////////////////////////////////////////////////////////////
 * A StringReader that exposes its contained string for fast direct access.
475 * Might make sense to generalize this to CharSequence and make it public?
477 static final class FastStringReader extends StringReader {
479 private final String s;
481 FastStringReader(String s) {