1 package org.apache.lucene.queryParser.analyzing;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.util.ArrayList;
23 import java.util.List;
25 import org.apache.lucene.analysis.Analyzer;
26 import org.apache.lucene.analysis.TokenStream;
27 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
28 import org.apache.lucene.queryParser.ParseException;
29 import org.apache.lucene.search.Query;
30 import org.apache.lucene.util.Version;
33 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
34 * are also passed through the given analyzer, but wild card characters (like <code>*</code>)
35 * don't get removed from the search terms.
37 * <p><b>Warning:</b> This class should only be used with analyzers that neither use stopwords
38 * nor add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
39 * will turn <code>Häuser</code> into <code>hau</code>, but <code>H?user</code> will
40 * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
41 * using this parser will be no improvement over QueryParser in such cases).
43 * @version $Revision$, $Date$
45 public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {
48 * Constructs a query parser.
49 * @param field the default field for query terms.
50 * @param analyzer used to find terms in the query text.
// Constructor: forwards matchVersion, field and analyzer unchanged to the
// QueryParser superclass constructor; no additional state is set up here.
52 public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
53 super(matchVersion, field, analyzer);
58 * Called when parser parses an input term token that contains one or more wildcard
59 * characters (like <code>*</code>), but is not a prefix term token (one
60 * that has just a single * character at the end).
62 * Example: will be called for <code>H?user</code> or for <code>H*user</code>
63 * but not for <code>*user</code>.
65 * Depending on analyzer and settings, a wildcard term may (most probably will)
66 * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
68 * Overrides super class, by passing terms through analyzer.
70 * @param field Name of the field query will use.
71 * @param termStr Term token that contains one or more wild card
72 * characters (? or *), but is not simple prefix term
74 * @return Resulting {@link Query} built for the term
75 * @throws ParseException
// Builds a wildcard query whose literal text has been run through the analyzer.
// Visible steps: (1) split termStr into literal fragments (tlist) and runs of
// wildcard characters (wlist); (2) analyze termStr and overwrite the literal
// fragments with the analyzer's tokens; (3) stitch fragments and wildcard runs
// back together and delegate to super.getWildcardQuery.
// NOTE(review): this listing appears to have lost lines (no "try" openers, else
// branches, loop header, closing braces, or declarations of source and
// countTokens are visible). The comments below describe only what is visible.
78 protected Query getWildcardQuery(String field, String termStr) throws ParseException {
// tlist: literal (non-wildcard) fragments; wlist: runs of '?' and '*' characters.
79 List<String> tlist = new ArrayList<String>();
80 List<String> wlist = new ArrayList<String>();
81 /* somewhat a hack: find/store wildcard chars
82 * in order to put them back after analyzing */
// Starts out true unless the term begins with a wildcard character.
83 boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
// Accumulates the characters of the fragment or wildcard run currently being read.
84 StringBuilder tmpBuffer = new StringBuilder();
85 char[] chars = termStr.toCharArray();
86 for (int i = 0; i < termStr.length(); i++) {
// Current character is a wildcard.
87 if (chars[i] == '?' || chars[i] == '*') {
// Flush the buffered literal fragment into tlist and start a fresh buffer.
// NOTE(review): the guard around this flush is not visible here; presumably it
// depends on isWithinToken - confirm against the full file.
89 tlist.add(tmpBuffer.toString());
90 tmpBuffer.setLength(0);
92 isWithinToken = false;
// Flush the buffered wildcard run into wlist and start a fresh buffer
// (presumably the non-wildcard branch; its opening lines are not visible).
95 wlist.add(tmpBuffer.toString());
96 tmpBuffer.setLength(0);
// The current character is appended to the buffer.
100 tmpBuffer.append(chars[i]);
// Whatever remains in the buffer is added to tlist or to wlist.
// NOTE(review): the condition choosing between the two is not visible.
103 tlist.add(tmpBuffer.toString());
105 wlist.add(tmpBuffer.toString());
108 // get Analyzer from superclass and tokenize the term
// The complete termStr, wildcard characters included, is handed to the analyzer.
113 source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
// An IOException while obtaining the stream is rethrown unchecked, cause preserved.
115 } catch (IOException e1) {
116 throw new RuntimeException(e1);
118 CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
// Leave the token loop once the stream reports no further tokens.
121 if (!source.incrementToken()) break;
// NOTE(review): the handler body for this IOException is not visible.
122 } catch (IOException e) {
125 String term = termAtt.toString();
// Empty tokens are skipped; each non-empty token replaces the literal fragment
// at index countTokens, and countTokens is then incremented.
126 if (!"".equals(term)) {
128 tlist.set(countTokens++, term);
// Reached when the analyzer yields more non-empty tokens than tlist has entries.
// NOTE(review): the handler body is not visible.
129 } catch (IndexOutOfBoundsException ioobe) {
// NOTE(review): presumably guards stream cleanup; the guarded statements and the
// handler body are not visible.
137 } catch (IOException e) {
// The number of tokens written must equal the number of literal fragments.
141 if (countTokens != tlist.size()) {
142 /* this means that the analyzer used either added or consumed
143 * (common for a stemmer) tokens, and we can't build a WildcardQuery */
144 throw new ParseException("Cannot build WildcardQuery with analyzer "
145 + getAnalyzer().getClass() + " - tokens added or lost");
// NOTE(review): the body of the size() == 0 branch is not visible.
148 if (tlist.size() == 0) {
150 } else if (tlist.size() == 1) {
// wlist is never reassigned in the visible lines, so the null check looks redundant.
151 if (wlist != null && wlist.size() == 1) {
152 /* if wlist contains one wildcard, it must be at the end, because:
153 * 1) wildcards are not allowed in 1st position of a term by QueryParser
154 * 2) if wildcard was *not* in end, there would be *two* or more tokens */
// Single analyzed fragment followed by the single wildcard run.
155 return super.getWildcardQuery(field, tlist.get(0)
156 + wlist.get(0).toString());
158 /* we should never get here! if so, this method was called
159 * with a termStr containing no wildcard ... */
160 throw new IllegalArgumentException("getWildcardQuery called without wildcard");
163 /* the term was tokenized, let's rebuild to one token
164 * with wildcards put back in position */
165 StringBuilder sb = new StringBuilder();
// Interleave: fragment i, then wildcard run i when wlist has an entry at i.
166 for (int i = 0; i < tlist.size(); i++) {
167 sb.append( tlist.get(i));
168 if (wlist != null && wlist.size() > i) {
169 sb.append(wlist.get(i));
// Delegate to the superclass with the reassembled pattern.
172 return super.getWildcardQuery(field, sb.toString());
177 * Called when parser parses an input term
178 * token that uses prefix notation; that is, contains a single '*' wildcard
179 * character as its last character. Since this is a special case
180 * of generic wildcard term, and such a query can be optimized easily,
181 * this usually results in a different query object.
183 * Depending on analyzer and settings, a prefix term may (most probably will)
184 * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
186 * Overrides super class, by passing terms through analyzer.
188 * @param field Name of the field query will use.
189 * @param termStr Term token to use for building term for the query
190 * (<b>without</b> trailing '*' character!)
192 * @return Resulting {@link Query} built for the term
193 * @throws ParseException
// Builds a prefix query from the analyzed form of termStr: the term is run
// through the analyzer, every emitted token is collected, and the query is
// built only when exactly one token came back.
// NOTE(review): this listing appears to have lost lines (the declaration of
// source, the "try" openers, the loop header and closing braces are not
// visible). The comments describe only what is visible.
196 protected Query getPrefixQuery(String field, String termStr) throws ParseException {
197 // get Analyzer from superclass and tokenize the term
// Collects every token the analyzer produces for termStr.
199 List<String> tlist = new ArrayList<String>();
201 source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
// An IOException while obtaining the stream is rethrown unchecked, cause preserved.
203 } catch (IOException e1) {
204 throw new RuntimeException(e1);
206 CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
// Leave the token loop once the stream reports no further tokens.
209 if (!source.incrementToken()) break;
// NOTE(review): the handler body for this IOException is not visible.
210 } catch (IOException e) {
213 tlist.add(termAtt.toString());
// NOTE(review): presumably guards stream cleanup; the guarded statements and the
// handler body are not visible.
219 } catch (IOException e) {
// Exactly one token: delegate to the superclass with the analyzed term.
223 if (tlist.size() == 1) {
224 return super.getPrefixQuery(field, tlist.get(0));
226 /* this means that the analyzer used either added or consumed
227 * (common for a stemmer) tokens, and we can't build a PrefixQuery */
// The message suffix distinguishes more than one token from no token at all.
228 throw new ParseException("Cannot build PrefixQuery with analyzer "
229 + getAnalyzer().getClass()
230 + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
235 * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
237 * Depending on analyzer and settings, a fuzzy term may (most probably will)
238 * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
240 * Overrides super class, by passing terms through analyzer.
242 * @param field Name of the field query will use.
243 * @param termStr Term token to use for building term for the query
245 * @return Resulting {@link Query} built for the term
246 * @exception ParseException
// Builds a fuzzy query from the analyzed form of termStr. The first token the
// analyzer emits becomes the query term; a second token is treated as an error.
// Returns null when the analyzer produced no token.
// NOTE(review): this listing appears to have lost lines ("try" openers, cleanup
// statements and closing braces are not visible). The comments describe only
// what is visible.
249 protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
250 throws ParseException {
251 // get Analyzer from superclass and tokenize the term
252 TokenStream source = null;
// Stays null unless the analyzer emits at least one token.
253 String nextToken = null;
254 boolean multipleTokens = false;
257 source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
258 CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
// Capture the first token, if there is one.
260 if (source.incrementToken()) {
261 nextToken = termAtt.toString();
// True when the stream still has a token after the one just read.
263 multipleTokens = source.incrementToken();
// NOTE(review): the handler body for this IOException is not visible.
264 } catch (IOException e) {
// NOTE(review): presumably guards stream cleanup; the guarded statements and the
// handler body are not visible.
271 } catch (IOException e) {
275 if (multipleTokens) {
276 throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
277 + " - tokens were added");
// No token from the analyzer yields null; otherwise delegate with the analyzed
// term and the caller's minSimilarity unchanged.
280 return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
284 * Overrides super class, by passing terms through analyzer.
285 * @exception ParseException
// Builds a range query from the analyzed forms of both endpoints. part1 and
// part2 are each run through the analyzer and replaced by the first token
// emitted; an endpoint that yields a further token raises ParseException.
// NOTE(review): this listing appears to have lost lines ("try" openers, cleanup
// statements, any guards around each endpoint, and closing braces are not
// visible). The comments describe only what is visible.
288 protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
289 throws ParseException {
290 // get Analyzer from superclass and tokenize the terms
291 TokenStream source = null;
292 CharTermAttribute termAtt = null;
293 boolean multipleTokens = false;
// Lower endpoint: analyze part1.
298 source = getAnalyzer().reusableTokenStream(field, new StringReader(part1));
299 termAtt = source.addAttribute(CharTermAttribute.class);
301 multipleTokens = false;
// Replace part1 with its first analyzed token, if there is one.
304 if (source.incrementToken()) {
305 part1 = termAtt.toString();
// True when the stream still has a token after the one just read.
307 multipleTokens = source.incrementToken();
// NOTE(review): the handler body for this IOException is not visible.
308 } catch (IOException e) {
315 } catch (IOException e) {
318 if (multipleTokens) {
319 throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
320 + " - tokens were added to part1");
// NOTE(review): the check and throw below repeat the three lines above verbatim,
// after another IOException handler whose guarded statements are not visible.
// This looks like duplicated cleanup - verify against the full file.
325 } catch (IOException e) {
328 if (multipleTokens) {
329 throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
330 + " - tokens were added to part1");
// Upper endpoint: analyze part2 the same way.
336 source = getAnalyzer().reusableTokenStream(field, new StringReader(part2));
337 termAtt = source.addAttribute(CharTermAttribute.class);
// Replace part2 with its first analyzed token, if there is one.
339 if (source.incrementToken()) {
340 part2 = termAtt.toString();
342 multipleTokens = source.incrementToken();
// NOTE(review): the handler body for this IOException is not visible.
343 } catch (IOException e) {
349 } catch (IOException e) {
352 if (multipleTokens) {
353 throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
354 + " - tokens were added to part2");
// NOTE(review): same verbatim repetition as for part1 - verify against the full file.
359 } catch (IOException e) {
362 if (multipleTokens) {
363 throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
364 + " - tokens were added to part2");
// Delegate with the (possibly replaced) endpoints and the caller's inclusive flag.
366 return super.getRangeQuery(field, part1, part2, inclusive);