1 package org.apache.lucene.analysis.miscellaneous;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
39 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
40 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
41 * (with behaviour identical to {@link String#split(String)}),
42 * and that combines the functionality of
43 * {@link org.apache.lucene.analysis.LetterTokenizer},
44 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
45 * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
46 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
47 * multi-purpose class.
 * If you are unsure what a regular expression should look like, consider
50 * prototyping by simply trying various expressions on some test texts via
51 * {@link String#split(String)}. Once you are satisfied, give that regex to
52 * PatternAnalyzer. Also see <a target="_blank"
53 * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
55 * This class can be considerably faster than the "normal" Lucene tokenizers.
56 * It can also serve as a building block in a compound Lucene
57 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
60 * PatternAnalyzer pat = ...
61 * TokenStream tokenStream = new SnowballFilter(
62 * pat.tokenStream("content", "James is running round in the woods"),
67 public final class PatternAnalyzer extends Analyzer {
69 /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
70 public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
72 /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
73 public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
75 private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
76 CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
78 "a", "about", "above", "across", "adj", "after", "afterwards",
79 "again", "against", "albeit", "all", "almost", "alone", "along",
80 "already", "also", "although", "always", "among", "amongst", "an",
81 "and", "another", "any", "anyhow", "anyone", "anything",
82 "anywhere", "are", "around", "as", "at", "be", "became", "because",
83 "become", "becomes", "becoming", "been", "before", "beforehand",
84 "behind", "being", "below", "beside", "besides", "between",
85 "beyond", "both", "but", "by", "can", "cannot", "co", "could",
86 "down", "during", "each", "eg", "either", "else", "elsewhere",
87 "enough", "etc", "even", "ever", "every", "everyone", "everything",
88 "everywhere", "except", "few", "first", "for", "former",
89 "formerly", "from", "further", "had", "has", "have", "he", "hence",
90 "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
91 "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
92 "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
93 "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
94 "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
95 "must", "my", "myself", "namely", "neither", "never",
96 "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
97 "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
98 "once one", "only", "onto", "or", "other", "others", "otherwise",
99 "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
100 "rather", "s", "same", "seem", "seemed", "seeming", "seems",
101 "several", "she", "should", "since", "so", "some", "somehow",
102 "someone", "something", "sometime", "sometimes", "somewhere",
103 "still", "such", "t", "than", "that", "the", "their", "them",
104 "themselves", "then", "thence", "there", "thereafter", "thereby",
105 "therefor", "therein", "thereupon", "these", "they", "this",
106 "those", "though", "through", "throughout", "thru", "thus", "to",
107 "together", "too", "toward", "towards", "under", "until", "up",
108 "upon", "us", "very", "via", "was", "we", "well", "were", "what",
109 "whatever", "whatsoever", "when", "whence", "whenever",
110 "whensoever", "where", "whereafter", "whereas", "whereat",
111 "whereby", "wherefrom", "wherein", "whereinto", "whereof",
112 "whereon", "whereto", "whereunto", "whereupon", "wherever",
113 "wherewith", "whether", "which", "whichever", "whichsoever",
114 "while", "whilst", "whither", "who", "whoever", "whole", "whom",
115 "whomever", "whomsoever", "whose", "whosoever", "why", "will",
116 "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
117 "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
122 * A lower-casing word analyzer with English stop words (can be shared
123 * freely across threads without harm); global per class loader.
125 public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
126 Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
129 * A lower-casing word analyzer with <b>extended </b> English stop words
130 * (can be shared freely across threads without harm); global per class
131 * loader. The stop words are borrowed from
132 * http://thomas.loc.gov/home/stopwords.html, see
133 * http://thomas.loc.gov/home/all.about.inquery.html
135 public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
136 Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
138 private final Pattern pattern;
139 private final boolean toLowerCase;
140 private final Set<?> stopWords;
142 private final Version matchVersion;
145 * Constructs a new instance with the given parameters.
147 * @param matchVersion If >= {@link Version#LUCENE_29}, StopFilter.enablePositionIncrement is set to true
149 * a regular expression delimiting tokens
151 * if <code>true</code> returns tokens after applying
152 * String.toLowerCase()
154 * if non-null, ignores all tokens that are contained in the
155 * given stop set (after previously having applied toLowerCase()
156 * if applicable). For example, created via
157 * {@link StopFilter#makeStopSet(Version, String[])}and/or
158 * {@link org.apache.lucene.analysis.WordlistLoader}as in
159 * <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
160 * or <a href="http://www.unine.ch/info/clef/">other stop words
163 public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set<?> stopWords) {
165 throw new IllegalArgumentException("pattern must not be null");
167 if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
168 else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
170 if (stopWords != null && stopWords.size() == 0) stopWords = null;
172 this.pattern = pattern;
173 this.toLowerCase = toLowerCase;
174 this.stopWords = stopWords;
175 this.matchVersion = matchVersion;
179 * Creates a token stream that tokenizes the given string into token terms
183 * the name of the field to tokenize (currently ignored).
185 * the string to tokenize
186 * @return a new token stream
188 public TokenStream tokenStream(String fieldName, String text) {
189 // Ideally the Analyzer superclass should have a method with the same signature,
190 // with a default impl that simply delegates to the StringReader flavour.
192 throw new IllegalArgumentException("text must not be null");
195 if (pattern == NON_WORD_PATTERN) { // fast path
196 stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
198 else if (pattern == WHITESPACE_PATTERN) { // fast path
199 stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
202 stream = new PatternTokenizer(text, pattern, toLowerCase);
203 if (stopWords != null) stream = new StopFilter(matchVersion, stream, stopWords);
210 * Creates a token stream that tokenizes all the text in the given Reader;
211 * This implementation forwards to <code>tokenStream(String, String)</code> and is
212 * less efficient than <code>tokenStream(String, String)</code>.
215 * the name of the field to tokenize (currently ignored).
217 * the reader delivering the text
218 * @return a new token stream
221 public TokenStream tokenStream(String fieldName, Reader reader) {
222 if (reader instanceof FastStringReader) { // fast path
223 return tokenStream(fieldName, ((FastStringReader)reader).getString());
227 String text = toString(reader);
228 return tokenStream(fieldName, text);
229 } catch (IOException e) {
230 throw new RuntimeException(e);
235 * Indicates whether some other object is "equal to" this one.
238 * the reference object with which to compare.
239 * @return true if equal, false otherwise
242 public boolean equals(Object other) {
243 if (this == other) return true;
244 if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
245 if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
247 if (other instanceof PatternAnalyzer) {
248 PatternAnalyzer p2 = (PatternAnalyzer) other;
250 toLowerCase == p2.toLowerCase &&
251 eqPattern(pattern, p2.pattern) &&
252 eq(stopWords, p2.stopWords);
258 * Returns a hash code value for the object.
260 * @return the hash code.
263 public int hashCode() {
264 if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
265 if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
268 h = 31*h + pattern.pattern().hashCode();
269 h = 31*h + pattern.flags();
270 h = 31*h + (toLowerCase ? 1231 : 1237);
271 h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
275 /** equality where o1 and/or o2 can be null */
276 private static boolean eq(Object o1, Object o2) {
277 return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
280 /** assumes p1 and p2 are not null */
281 private static boolean eqPattern(Pattern p1, Pattern p2) {
282 return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
286 * Reads until end-of-stream and returns all read chars, finally closes the stream.
288 * @param input the input stream
289 * @throws IOException if an I/O error occurs while reading the stream
291 private static String toString(Reader input) throws IOException {
294 char[] buffer = new char[len];
295 char[] output = new char[len];
299 while ((n = input.read(buffer)) >= 0) {
300 if (len + n > output.length) { // grow capacity
301 char[] tmp = new char[Math.max(output.length << 1, len + n)];
302 System.arraycopy(output, 0, tmp, 0, len);
303 System.arraycopy(buffer, 0, tmp, len, n);
304 buffer = output; // use larger buffer for future larger bulk reads
307 System.arraycopy(buffer, 0, output, len, n);
312 return new String(output, 0, len);
319 ///////////////////////////////////////////////////////////////////////////////
321 ///////////////////////////////////////////////////////////////////////////////
323 * The work horse; performance isn't fantastic, but it's not nearly as bad
324 * as one might think - kudos to the Sun regex developers.
326 private static final class PatternTokenizer extends TokenStream {
328 private final String str;
329 private final boolean toLowerCase;
330 private Matcher matcher;
332 private static final Locale locale = Locale.getDefault();
333 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
334 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
336 public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
338 this.matcher = pattern.matcher(str);
339 this.toLowerCase = toLowerCase;
343 public final boolean incrementToken() {
344 if (matcher == null) return false;
346 while (true) { // loop takes care of leading and trailing boundary cases
349 boolean isMatch = matcher.find();
351 end = matcher.start();
355 matcher = null; // we're finished
358 if (start != end) { // non-empty match (header/trailer)
359 String text = str.substring(start, end);
360 if (toLowerCase) text = text.toLowerCase(locale);
361 termAtt.setEmpty().append(text);
362 offsetAtt.setOffset(start, end);
365 if (!isMatch) return false;
370 public final void end() {
372 final int finalOffset = str.length();
373 this.offsetAtt.setOffset(finalOffset, finalOffset);
378 ///////////////////////////////////////////////////////////////////////////////
380 ///////////////////////////////////////////////////////////////////////////////
382 * Special-case class for best performance in common cases; this class is
383 * otherwise unnecessary.
385 private static final class FastStringTokenizer extends TokenStream {
387 private final String str;
389 private final boolean isLetter;
390 private final boolean toLowerCase;
391 private final Set<?> stopWords;
392 private static final Locale locale = Locale.getDefault();
393 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
394 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
396 public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
398 this.isLetter = isLetter;
399 this.toLowerCase = toLowerCase;
400 this.stopWords = stopWords;
404 public boolean incrementToken() {
406 // cache loop instance vars (performance)
408 int len = s.length();
410 boolean letter = isLetter;
415 // find beginning of token
417 while (i < len && !isTokenChar(s.charAt(i), letter)) {
421 if (i < len) { // found beginning; now find end of token
423 while (i < len && isTokenChar(s.charAt(i), letter)) {
427 text = s.substring(start, i);
428 if (toLowerCase) text = text.toLowerCase(locale);
429 // if (toLowerCase) {
430 //// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
431 //// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
432 // text = s.substring(start, i).toLowerCase();
433 //// char[] chars = new char[i-start];
434 //// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
435 //// text = new String(chars);
437 // text = s.substring(start, i);
440 } while (text != null && isStopWord(text));
447 termAtt.setEmpty().append(text);
448 offsetAtt.setOffset(start, i);
453 public final void end() {
455 final int finalOffset = str.length();
456 this.offsetAtt.setOffset(finalOffset, finalOffset);
459 private boolean isTokenChar(char c, boolean isLetter) {
460 return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
463 private boolean isStopWord(String text) {
464 return stopWords != null && stopWords.contains(text);
470 ///////////////////////////////////////////////////////////////////////////////
472 ///////////////////////////////////////////////////////////////////////////////
 * A StringReader that exposes its contained string for fast direct access.
475 * Might make sense to generalize this to CharSequence and make it public?
477 static final class FastStringReader extends StringReader {
479 private final String s;
481 FastStringReader(String s) {