X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java deleted file mode 100644 index c8d985a..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java +++ /dev/null @@ -1,491 +0,0 @@ -package org.apache.lucene.analysis.miscellaneous; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.Arrays; -import java.util.Locale; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.StopAnalyzer; -import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.util.Version; - -/** - * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a - * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern} - * (with behaviour identical to {@link String#split(String)}), - * and that combines the functionality of - * {@link org.apache.lucene.analysis.LetterTokenizer}, - * {@link org.apache.lucene.analysis.LowerCaseTokenizer}, - * {@link org.apache.lucene.analysis.WhitespaceTokenizer}, - * {@link org.apache.lucene.analysis.StopFilter} into a single efficient - * multi-purpose class. - *

- * If you are unsure what exactly a regular expression should look like, consider - * prototyping by simply trying various expressions on some test texts via - * {@link String#split(String)}. Once you are satisfied, give that regex to - * PatternAnalyzer. Also see the Java Regular Expression Tutorial. - *

- * This class can be considerably faster than the "normal" Lucene tokenizers. - * It can also serve as a building block in a compound Lucene - * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this - * stemming example: - *

- * PatternAnalyzer pat = ...
- * TokenStream tokenStream = new SnowballFilter(
- *     pat.tokenStream("content", "James is running round in the woods"), 
- *     "English");
- * 
- * - */ -public final class PatternAnalyzer extends Analyzer { - - /** "\\W+"; Divides text at non-letters (NOT Character.isLetter(c)) */ - public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+"); - - /** "\\s+"; Divides text at whitespaces (Character.isWhitespace(c)) */ - public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+"); - - private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS = - CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, - Arrays.asList( - "a", "about", "above", "across", "adj", "after", "afterwards", - "again", "against", "albeit", "all", "almost", "alone", "along", - "already", "also", "although", "always", "among", "amongst", "an", - "and", "another", "any", "anyhow", "anyone", "anything", - "anywhere", "are", "around", "as", "at", "be", "became", "because", - "become", "becomes", "becoming", "been", "before", "beforehand", - "behind", "being", "below", "beside", "besides", "between", - "beyond", "both", "but", "by", "can", "cannot", "co", "could", - "down", "during", "each", "eg", "either", "else", "elsewhere", - "enough", "etc", "even", "ever", "every", "everyone", "everything", - "everywhere", "except", "few", "first", "for", "former", - "formerly", "from", "further", "had", "has", "have", "he", "hence", - "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", - "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", - "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", - "latter", "latterly", "least", "less", "ltd", "many", "may", "me", - "meanwhile", "might", "more", "moreover", "most", "mostly", "much", - "must", "my", "myself", "namely", "neither", "never", - "nevertheless", "next", "no", "nobody", "none", "noone", "nor", - "not", "nothing", "now", "nowhere", "of", "off", "often", "on", - "once one", "only", "onto", "or", "other", "others", "otherwise", - "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", - "rather", 
"s", "same", "seem", "seemed", "seeming", "seems", - "several", "she", "should", "since", "so", "some", "somehow", - "someone", "something", "sometime", "sometimes", "somewhere", - "still", "such", "t", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", - "therefor", "therein", "thereupon", "these", "they", "this", - "those", "though", "through", "throughout", "thru", "thus", "to", - "together", "too", "toward", "towards", "under", "until", "up", - "upon", "us", "very", "via", "was", "we", "well", "were", "what", - "whatever", "whatsoever", "when", "whence", "whenever", - "whensoever", "where", "whereafter", "whereas", "whereat", - "whereby", "wherefrom", "wherein", "whereinto", "whereof", - "whereon", "whereto", "whereunto", "whereupon", "wherever", - "wherewith", "whether", "which", "whichever", "whichsoever", - "while", "whilst", "whither", "who", "whoever", "whole", "whom", - "whomever", "whomsoever", "whose", "whosoever", "why", "will", - "with", "within", "without", "would", "xsubj", "xcal", "xauthor", - "xother ", "xnote", "yet", "you", "your", "yours", "yourself", - "yourselves" - ), true)); - - /** - * A lower-casing word analyzer with English stop words (can be shared - * freely across threads without harm); global per class loader. - */ - public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer( - Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET); - - /** - * A lower-casing word analyzer with extended English stop words - * (can be shared freely across threads without harm); global per class - * loader. 
The stop words are borrowed from - * http://thomas.loc.gov/home/stopwords.html, see - * http://thomas.loc.gov/home/all.about.inquery.html - */ - public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer( - Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS); - - private final Pattern pattern; - private final boolean toLowerCase; - private final Set stopWords; - - private final Version matchVersion; - - /** - * Constructs a new instance with the given parameters. - * - * @param matchVersion If >= {@link Version#LUCENE_29}, StopFilter.enablePositionIncrement is set to true - * @param pattern - * a regular expression delimiting tokens - * @param toLowerCase - * if true returns tokens after applying - * String.toLowerCase() - * @param stopWords - * if non-null, ignores all tokens that are contained in the - * given stop set (after previously having applied toLowerCase() - * if applicable). For example, created via - * {@link StopFilter#makeStopSet(Version, String[])}and/or - * {@link org.apache.lucene.analysis.WordlistLoader}as in - * WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt") - * or other stop words - * lists . - */ - public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set stopWords) { - if (pattern == null) - throw new IllegalArgumentException("pattern must not be null"); - - if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN; - else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN; - - if (stopWords != null && stopWords.size() == 0) stopWords = null; - - this.pattern = pattern; - this.toLowerCase = toLowerCase; - this.stopWords = stopWords; - this.matchVersion = matchVersion; - } - - /** - * Creates a token stream that tokenizes the given string into token terms - * (aka words). - * - * @param fieldName - * the name of the field to tokenize (currently ignored). 
- * @param text - * the string to tokenize - * @return a new token stream - */ - public TokenStream tokenStream(String fieldName, String text) { - // Ideally the Analyzer superclass should have a method with the same signature, - // with a default impl that simply delegates to the StringReader flavour. - if (text == null) - throw new IllegalArgumentException("text must not be null"); - - TokenStream stream; - if (pattern == NON_WORD_PATTERN) { // fast path - stream = new FastStringTokenizer(text, true, toLowerCase, stopWords); - } - else if (pattern == WHITESPACE_PATTERN) { // fast path - stream = new FastStringTokenizer(text, false, toLowerCase, stopWords); - } - else { - stream = new PatternTokenizer(text, pattern, toLowerCase); - if (stopWords != null) stream = new StopFilter(matchVersion, stream, stopWords); - } - - return stream; - } - - /** - * Creates a token stream that tokenizes all the text in the given Reader; - * This implementation forwards to tokenStream(String, String) and is - * less efficient than tokenStream(String, String). - * - * @param fieldName - * the name of the field to tokenize (currently ignored). - * @param reader - * the reader delivering the text - * @return a new token stream - */ - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - if (reader instanceof FastStringReader) { // fast path - return tokenStream(fieldName, ((FastStringReader)reader).getString()); - } - - try { - String text = toString(reader); - return tokenStream(fieldName, text); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - /** - * Indicates whether some other object is "equal to" this one. - * - * @param other - * the reference object with which to compare. 
- * @return true if equal, false otherwise - */ - @Override - public boolean equals(Object other) { - if (this == other) return true; - if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false; - if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false; - - if (other instanceof PatternAnalyzer) { - PatternAnalyzer p2 = (PatternAnalyzer) other; - return - toLowerCase == p2.toLowerCase && - eqPattern(pattern, p2.pattern) && - eq(stopWords, p2.stopWords); - } - return false; - } - - /** - * Returns a hash code value for the object. - * - * @return the hash code. - */ - @Override - public int hashCode() { - if (this == DEFAULT_ANALYZER) return -1218418418; // fast path - if (this == EXTENDED_ANALYZER) return 1303507063; // fast path - - int h = 1; - h = 31*h + pattern.pattern().hashCode(); - h = 31*h + pattern.flags(); - h = 31*h + (toLowerCase ? 1231 : 1237); - h = 31*h + (stopWords != null ? stopWords.hashCode() : 0); - return h; - } - - /** equality where o1 and/or o2 can be null */ - private static boolean eq(Object o1, Object o2) { - return (o1 == o2) || (o1 != null ? o1.equals(o2) : false); - } - - /** assumes p1 and p2 are not null */ - private static boolean eqPattern(Pattern p1, Pattern p2) { - return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern())); - } - - /** - * Reads until end-of-stream and returns all read chars, finally closes the stream. 
- * - * @param input the input stream - * @throws IOException if an I/O error occurs while reading the stream - */ - private static String toString(Reader input) throws IOException { - try { - int len = 256; - char[] buffer = new char[len]; - char[] output = new char[len]; - - len = 0; - int n; - while ((n = input.read(buffer)) >= 0) { - if (len + n > output.length) { // grow capacity - char[] tmp = new char[Math.max(output.length << 1, len + n)]; - System.arraycopy(output, 0, tmp, 0, len); - System.arraycopy(buffer, 0, tmp, len, n); - buffer = output; // use larger buffer for future larger bulk reads - output = tmp; - } else { - System.arraycopy(buffer, 0, output, len, n); - } - len += n; - } - - return new String(output, 0, len); - } finally { - input.close(); - } - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * The work horse; performance isn't fantastic, but it's not nearly as bad - * as one might think - kudos to the Sun regex developers. 
- */ - private static final class PatternTokenizer extends TokenStream { - - private final String str; - private final boolean toLowerCase; - private Matcher matcher; - private int pos = 0; - private static final Locale locale = Locale.getDefault(); - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - - public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) { - this.str = str; - this.matcher = pattern.matcher(str); - this.toLowerCase = toLowerCase; - } - - @Override - public final boolean incrementToken() { - if (matcher == null) return false; - clearAttributes(); - while (true) { // loop takes care of leading and trailing boundary cases - int start = pos; - int end; - boolean isMatch = matcher.find(); - if (isMatch) { - end = matcher.start(); - pos = matcher.end(); - } else { - end = str.length(); - matcher = null; // we're finished - } - - if (start != end) { // non-empty match (header/trailer) - String text = str.substring(start, end); - if (toLowerCase) text = text.toLowerCase(locale); - termAtt.setEmpty().append(text); - offsetAtt.setOffset(start, end); - return true; - } - if (!isMatch) return false; - } - } - - @Override - public final void end() { - // set final offset - final int finalOffset = str.length(); - this.offsetAtt.setOffset(finalOffset, finalOffset); - } - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * Special-case class for best performance in common cases; this class is - * otherwise unnecessary. 
- */ - private static final class FastStringTokenizer extends TokenStream { - - private final String str; - private int pos; - private final boolean isLetter; - private final boolean toLowerCase; - private final Set stopWords; - private static final Locale locale = Locale.getDefault(); - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - - public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) { - this.str = str; - this.isLetter = isLetter; - this.toLowerCase = toLowerCase; - this.stopWords = stopWords; - } - - @Override - public boolean incrementToken() { - clearAttributes(); - // cache loop instance vars (performance) - String s = str; - int len = s.length(); - int i = pos; - boolean letter = isLetter; - - int start = 0; - String text; - do { - // find beginning of token - text = null; - while (i < len && !isTokenChar(s.charAt(i), letter)) { - i++; - } - - if (i < len) { // found beginning; now find end of token - start = i; - while (i < len && isTokenChar(s.charAt(i), letter)) { - i++; - } - - text = s.substring(start, i); - if (toLowerCase) text = text.toLowerCase(locale); -// if (toLowerCase) { -//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed -//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809 -// text = s.substring(start, i).toLowerCase(); -//// char[] chars = new char[i-start]; -//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j)); -//// text = new String(chars); -// } else { -// text = s.substring(start, i); -// } - } - } while (text != null && isStopWord(text)); - - pos = i; - if (text == null) - { - return false; - } - termAtt.setEmpty().append(text); - offsetAtt.setOffset(start, i); - return true; - } - - @Override - public final void end() { - // set final offset - final int finalOffset = str.length(); - 
this.offsetAtt.setOffset(finalOffset, finalOffset); - } - - private boolean isTokenChar(char c, boolean isLetter) { - return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c); - } - - private boolean isStopWord(String text) { - return stopWords != null && stopWords.contains(text); - } - - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * A StringReader that exposes it's contained string for fast direct access. - * Might make sense to generalize this to CharSequence and make it public? - */ - static final class FastStringReader extends StringReader { - - private final String s; - - FastStringReader(String s) { - super(s); - this.s = s; - } - - String getString() { - return s; - } - } - -}