+++ /dev/null
-package org.apache.lucene.queryParser.analyzing;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Version;
-
-/**
- * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
- * are also passed through the given analyzer, but wild card characters (like <code>*</code>)
- * don't get removed from the search terms.
- *
- * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
- * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
- * will turn <code>Häuser</code> into <code>hau</code>, but <code>H?user</code> will
- * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
- * using this parser will be no improvement over QueryParser in such cases).
- *
- * @version $Revision$, $Date$
- */
-public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {
-
- /**
- * Constructs a query parser.
- * @param field the default field for query terms.
- * @param analyzer used to find terms in the query text.
- */
- public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
- super(matchVersion, field, analyzer);
- }
-
- /**
- * Called when parser
- * parses an input term token that contains one or more wildcard
- * characters (like <code>*</code>), but is not a prefix term token (one
- * that has just a single * character at the end).
- * <p>
- * Example: will be called for <code>H?user</code> or for <code>H*user</code>
- * but not for <code>*user</code>.
- * <p>
- * Depending on analyzer and settings, a wildcard term may (most probably will)
- * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
- * <p>
- * Overrides super class, by passing terms through analyzer.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token that contains one or more wild card
- * characters (? or *), but is not simple prefix term
- *
- * @return Resulting {@link Query} built for the term
- * @throws ParseException
- */
- @Override
- protected Query getWildcardQuery(String field, String termStr) throws ParseException {
- List<String> tlist = new ArrayList<String>();
- List<String> wlist = new ArrayList<String>();
- /* somewhat a hack: find/store wildcard chars
- * in order to put them back after analyzing */
- boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
- StringBuilder tmpBuffer = new StringBuilder();
- char[] chars = termStr.toCharArray();
- for (int i = 0; i < termStr.length(); i++) {
- if (chars[i] == '?' || chars[i] == '*') {
- if (isWithinToken) {
- tlist.add(tmpBuffer.toString());
- tmpBuffer.setLength(0);
- }
- isWithinToken = false;
- } else {
- if (!isWithinToken) {
- wlist.add(tmpBuffer.toString());
- tmpBuffer.setLength(0);
- }
- isWithinToken = true;
- }
- tmpBuffer.append(chars[i]);
- }
- if (isWithinToken) {
- tlist.add(tmpBuffer.toString());
- } else {
- wlist.add(tmpBuffer.toString());
- }
-
- // get Analyzer from superclass and tokenize the term
- TokenStream source;
-
- int countTokens = 0;
- try {
- source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
- source.reset();
- } catch (IOException e1) {
- throw new RuntimeException(e1);
- }
- CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
- while (true) {
- try {
- if (!source.incrementToken()) break;
- } catch (IOException e) {
- break;
- }
- String term = termAtt.toString();
- if (!"".equals(term)) {
- try {
- tlist.set(countTokens++, term);
- } catch (IndexOutOfBoundsException ioobe) {
- countTokens = -1;
- }
- }
- }
- try {
- source.end();
- source.close();
- } catch (IOException e) {
- // ignore
- }
-
- if (countTokens != tlist.size()) {
- /* this means that the analyzer used either added or consumed
- * (common for a stemmer) tokens, and we can't build a WildcardQuery */
- throw new ParseException("Cannot build WildcardQuery with analyzer "
- + getAnalyzer().getClass() + " - tokens added or lost");
- }
-
- if (tlist.size() == 0) {
- return null;
- } else if (tlist.size() == 1) {
- if (wlist != null && wlist.size() == 1) {
- /* if wlist contains one wildcard, it must be at the end, because:
- * 1) wildcards are not allowed in 1st position of a term by QueryParser
- * 2) if wildcard was *not* in end, there would be *two* or more tokens */
- return super.getWildcardQuery(field, tlist.get(0)
- + wlist.get(0).toString());
- } else {
- /* we should never get here! if so, this method was called
- * with a termStr containing no wildcard ... */
- throw new IllegalArgumentException("getWildcardQuery called without wildcard");
- }
- } else {
- /* the term was tokenized, let's rebuild to one token
- * with wildcards put back in postion */
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < tlist.size(); i++) {
- sb.append( tlist.get(i));
- if (wlist != null && wlist.size() > i) {
- sb.append(wlist.get(i));
- }
- }
- return super.getWildcardQuery(field, sb.toString());
- }
- }
-
- /**
- * Called when parser parses an input term
- * token that uses prefix notation; that is, contains a single '*' wildcard
- * character as its last character. Since this is a special case
- * of generic wildcard term, and such a query can be optimized easily,
- * this usually results in a different query object.
- * <p>
- * Depending on analyzer and settings, a prefix term may (most probably will)
- * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
- * <p>
- * Overrides super class, by passing terms through analyzer.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token to use for building term for the query
- * (<b>without</b> trailing '*' character!)
- *
- * @return Resulting {@link Query} built for the term
- * @throws ParseException
- */
- @Override
- protected Query getPrefixQuery(String field, String termStr) throws ParseException {
- // get Analyzer from superclass and tokenize the term
- TokenStream source;
- List<String> tlist = new ArrayList<String>();
- try {
- source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
- source.reset();
- } catch (IOException e1) {
- throw new RuntimeException(e1);
- }
- CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
- while (true) {
- try {
- if (!source.incrementToken()) break;
- } catch (IOException e) {
- break;
- }
- tlist.add(termAtt.toString());
- }
-
- try {
- source.end();
- source.close();
- } catch (IOException e) {
- // ignore
- }
-
- if (tlist.size() == 1) {
- return super.getPrefixQuery(field, tlist.get(0));
- } else {
- /* this means that the analyzer used either added or consumed
- * (common for a stemmer) tokens, and we can't build a PrefixQuery */
- throw new ParseException("Cannot build PrefixQuery with analyzer "
- + getAnalyzer().getClass()
- + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
- }
- }
-
- /**
- * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
- * <p>
- * Depending on analyzer and settings, a fuzzy term may (most probably will)
- * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
- * <p>
- * Overrides super class, by passing terms through analyzer.
- *
- * @param field Name of the field query will use.
- * @param termStr Term token to use for building term for the query
- *
- * @return Resulting {@link Query} built for the term
- * @exception ParseException
- */
- @Override
- protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
- throws ParseException {
- // get Analyzer from superclass and tokenize the term
- TokenStream source = null;
- String nextToken = null;
- boolean multipleTokens = false;
-
- try {
- source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
- CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
- source.reset();
- if (source.incrementToken()) {
- nextToken = termAtt.toString();
- }
- multipleTokens = source.incrementToken();
- } catch (IOException e) {
- nextToken = null;
- }
-
- try {
- source.end();
- source.close();
- } catch (IOException e) {
- // ignore
- }
-
- if (multipleTokens) {
- throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
- + " - tokens were added");
- }
-
- return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
- }
-
- /**
- * Overrides super class, by passing terms through analyzer.
- * @exception ParseException
- */
- @Override
- protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
- throws ParseException {
- // get Analyzer from superclass and tokenize the terms
- TokenStream source = null;
- CharTermAttribute termAtt = null;
- boolean multipleTokens = false;
-
- if (part1 != null) {
- // part1
- try {
- source = getAnalyzer().reusableTokenStream(field, new StringReader(part1));
- termAtt = source.addAttribute(CharTermAttribute.class);
- source.reset();
- multipleTokens = false;
-
-
- if (source.incrementToken()) {
- part1 = termAtt.toString();
- }
- multipleTokens = source.incrementToken();
- } catch (IOException e) {
- // ignore
- }
-
- try {
- source.end();
- source.close();
- } catch (IOException e) {
- // ignore
- }
- if (multipleTokens) {
- throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
- + " - tokens were added to part1");
- }
- }
- try {
- source.close();
- } catch (IOException e) {
- // ignore
- }
- if (multipleTokens) {
- throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
- + " - tokens were added to part1");
- }
-
- if (part2 != null) {
- try {
- // part2
- source = getAnalyzer().reusableTokenStream(field, new StringReader(part2));
- termAtt = source.addAttribute(CharTermAttribute.class);
- source.reset();
- if (source.incrementToken()) {
- part2 = termAtt.toString();
- }
- multipleTokens = source.incrementToken();
- } catch (IOException e) {
- // ignore
- }
- try {
- source.end();
- source.close();
- } catch (IOException e) {
- // ignore
- }
- if (multipleTokens) {
- throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
- + " - tokens were added to part2");
- }
- }
- try {
- source.close();
- } catch (IOException e) {
- // ignore
- }
- if (multipleTokens) {
- throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
- + " - tokens were added to part2");
- }
- return super.getRangeQuery(field, part1, part2, inclusive);
- }
-
-}