lucene-java-3.4.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java

   1 package org.apache.lucene.queryParser.analyzing;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22 import java.util.ArrayList;
  23 import java.util.List;
  24
  25 import org.apache.lucene.analysis.Analyzer;
  26 import org.apache.lucene.analysis.TokenStream;
  27 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  28 import org.apache.lucene.queryParser.ParseException;
  29 import org.apache.lucene.search.Query;
  30 import org.apache.lucene.util.Version;
  31
  32 /**
  33  * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
  34  * are also passed through the given analyzer, but wild card characters (like <code>*</code>)
  35  * don't get removed from the search terms.
  36  *
  37  * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
  38  * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
  39  * will turn <code>H&auml;user</code> into <code>hau</code>, but <code>H?user</code> will
  40  * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
  41  * using this parser will be no improvement over QueryParser in such cases).
  42  *
  43  * @version $Revision$, $Date$
  44  */
  45 public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {
  46
  47   /**
  48    * Constructs a query parser.
  49    * @param field    the default field for query terms.
  50    * @param analyzer used to find terms in the query text.
  51    */
  52   public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
  53     super(matchVersion, field, analyzer);
  54   }
  55
  56   /**
  57    * Called when parser
  58    * parses an input term token that contains one or more wildcard
  59    * characters (like <code>*</code>), but is not a prefix term token (one
  60    * that has just a single * character at the end).
  61    * <p>
  62    * Example: will be called for <code>H?user</code> or for <code>H*user</code>
  63    * but not for <code>*user</code>.
  64    * <p>
  65    * Depending on analyzer and settings, a wildcard term may (most probably will)
  66    * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
  67    * <p>
  68    * Overrides super class, by passing terms through analyzer.
  69    *
  70    * @param  field   Name of the field query will use.
  71    * @param  termStr Term token that contains one or more wild card
  72    *                 characters (? or *), but is not simple prefix term
  73    *
  74    * @return Resulting {@link Query} built for the term
  75    * @throws ParseException
  76    */
  77   @Override
  78   protected Query getWildcardQuery(String field, String termStr) throws ParseException {
  79     List<String> tlist = new ArrayList<String>();
  80     List<String> wlist = new ArrayList<String>();
  81     /* somewhat a hack: find/store wildcard chars
  82      * in order to put them back after analyzing */
  83     boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
  84     StringBuilder tmpBuffer = new StringBuilder();
  85     char[] chars = termStr.toCharArray();
  86     for (int i = 0; i < termStr.length(); i++) {
  87       if (chars[i] == '?' || chars[i] == '*') {
  88         if (isWithinToken) {
  89           tlist.add(tmpBuffer.toString());
  90           tmpBuffer.setLength(0);
  91         }
  92         isWithinToken = false;
  93       } else {
  94         if (!isWithinToken) {
  95           wlist.add(tmpBuffer.toString());
  96           tmpBuffer.setLength(0);
  97         }
  98         isWithinToken = true;
  99       }
 100       tmpBuffer.append(chars[i]);
 101     }
 102     if (isWithinToken) {
 103       tlist.add(tmpBuffer.toString());
 104     } else {
 105       wlist.add(tmpBuffer.toString());
 106     }
 107
 108     // get Analyzer from superclass and tokenize the term
 109     TokenStream source;
 110
 111     int countTokens = 0;
 112     try {
 113       source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
 114       source.reset();
 115     } catch (IOException e1) {
 116       throw new RuntimeException(e1);
 117     }
 118     CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
 119     while (true) {
 120       try {
 121         if (!source.incrementToken()) break;
 122       } catch (IOException e) {
 123         break;
 124       }
 125       String term = termAtt.toString();
 126       if (!"".equals(term)) {
 127         try {
 128           tlist.set(countTokens++, term);
 129         } catch (IndexOutOfBoundsException ioobe) {
 130           countTokens = -1;
 131         }
 132       }
 133     }
 134     try {
 135       source.end();
 136       source.close();
 137     } catch (IOException e) {
 138       // ignore
 139     }
 140
 141     if (countTokens != tlist.size()) {
 142       /* this means that the analyzer used either added or consumed
 143        * (common for a stemmer) tokens, and we can't build a WildcardQuery */
 144       throw new ParseException("Cannot build WildcardQuery with analyzer "
 145           + getAnalyzer().getClass() + " - tokens added or lost");
 146     }
 147
 148     if (tlist.size() == 0) {
 149       return null;
 150     } else if (tlist.size() == 1) {
 151       if (wlist != null && wlist.size() == 1) {
 152         /* if wlist contains one wildcard, it must be at the end, because:
 153          * 1) wildcards are not allowed in 1st position of a term by QueryParser
 154          * 2) if wildcard was *not* in end, there would be *two* or more tokens */
 155         return super.getWildcardQuery(field, tlist.get(0)
 156             + wlist.get(0).toString());
 157       } else {
 158         /* we should never get here! if so, this method was called
 159          * with a termStr containing no wildcard ... */
 160         throw new IllegalArgumentException("getWildcardQuery called without wildcard");
 161       }
 162     } else {
 163       /* the term was tokenized, let's rebuild to one token
 164        * with wildcards put back in postion */
 165       StringBuilder sb = new StringBuilder();
 166       for (int i = 0; i < tlist.size(); i++) {
 167         sb.append( tlist.get(i));
 168         if (wlist != null && wlist.size() > i) {
 169           sb.append(wlist.get(i));
 170         }
 171       }
 172       return super.getWildcardQuery(field, sb.toString());
 173     }
 174   }
 175
 176   /**
 177    * Called when parser parses an input term
 178    * token that uses prefix notation; that is, contains a single '*' wildcard
 179    * character as its last character. Since this is a special case
 180    * of generic wildcard term, and such a query can be optimized easily,
 181    * this usually results in a different query object.
 182    * <p>
 183    * Depending on analyzer and settings, a prefix term may (most probably will)
 184    * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
 185    * <p>
 186    * Overrides super class, by passing terms through analyzer.
 187    *
 188    * @param  field   Name of the field query will use.
 189    * @param  termStr Term token to use for building term for the query
 190    *                 (<b>without</b> trailing '*' character!)
 191    *
 192    * @return Resulting {@link Query} built for the term
 193    * @throws ParseException
 194    */
 195   @Override
 196   protected Query getPrefixQuery(String field, String termStr) throws ParseException {
 197     // get Analyzer from superclass and tokenize the term
 198     TokenStream source;
 199     List<String> tlist = new ArrayList<String>();
 200     try {
 201       source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
 202       source.reset();
 203     } catch (IOException e1) {
 204       throw new RuntimeException(e1);
 205     }
 206     CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
 207     while (true) {
 208       try {
 209         if (!source.incrementToken()) break;
 210       } catch (IOException e) {
 211         break;
 212       }
 213       tlist.add(termAtt.toString());
 214     }
 215
 216     try {
 217       source.end();
 218       source.close();
 219     } catch (IOException e) {
 220       // ignore
 221     }
 222
 223     if (tlist.size() == 1) {
 224       return super.getPrefixQuery(field, tlist.get(0));
 225     } else {
 226       /* this means that the analyzer used either added or consumed
 227        * (common for a stemmer) tokens, and we can't build a PrefixQuery */
 228       throw new ParseException("Cannot build PrefixQuery with analyzer "
 229           + getAnalyzer().getClass()
 230           + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
 231     }
 232   }
 233
 234   /**
 235    * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
 236    * <p>
 237    * Depending on analyzer and settings, a fuzzy term may (most probably will)
 238    * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
 239    * <p>
 240    * Overrides super class, by passing terms through analyzer.
 241    *
 242    * @param field Name of the field query will use.
 243    * @param termStr Term token to use for building term for the query
 244    *
 245    * @return Resulting {@link Query} built for the term
 246    * @exception ParseException
 247    */
 248   @Override
 249   protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
 250       throws ParseException {
 251     // get Analyzer from superclass and tokenize the term
 252     TokenStream source = null;
 253     String nextToken = null;
 254     boolean multipleTokens = false;
 255
 256     try {
 257       source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
 258       CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
 259       source.reset();
 260       if (source.incrementToken()) {
 261         nextToken = termAtt.toString();
 262       }
 263       multipleTokens = source.incrementToken();
 264     } catch (IOException e) {
 265       nextToken = null;
 266     }
 267
 268     try {
 269       source.end();
 270       source.close();
 271     } catch (IOException e) {
 272       // ignore
 273     }
 274
 275     if (multipleTokens) {
 276       throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
 277           + " - tokens were added");
 278     }
 279
 280     return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
 281   }
 282
 283   /**
 284    * Overrides super class, by passing terms through analyzer.
 285    * @exception ParseException
 286    */
 287   @Override
 288   protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
 289       throws ParseException {
 290     // get Analyzer from superclass and tokenize the terms
 291     TokenStream source = null;
 292     CharTermAttribute termAtt = null;
 293     boolean multipleTokens = false;
 294
 295     if (part1 != null) {
 296       // part1
 297       try {
 298         source = getAnalyzer().reusableTokenStream(field, new StringReader(part1));
 299         termAtt = source.addAttribute(CharTermAttribute.class);
 300         source.reset();
 301         multipleTokens = false;
 302
 303
 304         if (source.incrementToken()) {
 305           part1 = termAtt.toString();
 306         }
 307         multipleTokens = source.incrementToken();
 308       } catch (IOException e) {
 309         // ignore
 310       }
 311
 312       try {
 313         source.end();
 314         source.close();
 315       } catch (IOException e) {
 316         // ignore
 317       }
 318       if (multipleTokens) {
 319         throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
 320             + " - tokens were added to part1");
 321       }
 322     }
 323     try {
 324       source.close();
 325     } catch (IOException e) {
 326       // ignore
 327     }
 328     if (multipleTokens) {
 329       throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
 330           + " - tokens were added to part1");
 331     }
 332
 333     if (part2 != null) {
 334       try {
 335         // part2
 336         source = getAnalyzer().reusableTokenStream(field, new StringReader(part2));
 337         termAtt = source.addAttribute(CharTermAttribute.class);
 338         source.reset();
 339         if (source.incrementToken()) {
 340           part2 = termAtt.toString();
 341         }
 342         multipleTokens = source.incrementToken();
 343       } catch (IOException e) {
 344         // ignore
 345       }
 346       try {
 347         source.end();
 348         source.close();
 349       } catch (IOException e) {
 350         // ignore
 351       }
 352       if (multipleTokens) {
 353         throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
 354             + " - tokens were added to part2");
 355       }
 356     }
 357     try {
 358       source.close();
 359     } catch (IOException e) {
 360       // ignore
 361     }
 362     if (multipleTokens) {
 363       throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
 364           + " - tokens were added to part2");
 365     }
 366     return super.getRangeQuery(field, part1, part2, inclusive);
 367   }
 368
 369 }