pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / queryparser / src / java / org / apache / lucene / queryParser / analyzing / AnalyzingQueryParser.java
diff --git a/lucene-java-3.5.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java b/lucene-java-3.5.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java

new file mode 100644 (file)

index 0000000..02e00f8
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
@@ -0,0 +1,369 @@
+package org.apache.lucene.queryParser.analyzing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.Version;
+
+/**
+ * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
+ * are also passed through the given analyzer, but wild card characters (like <code>*</code>) 
+ * don't get removed from the search terms.
+ * 
+ * <p><b>Warning:</b> This class should only be used with analyzers that do not use stopwords
+ * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer 
+ * will turn <code>H&auml;user</code> into <code>hau</code>, but <code>H?user</code> will 
+ * become <code>h?user</code> when using this parser and thus no match would be found (i.e.
+ * using this parser will be no improvement over QueryParser in such cases). 
+ *
+ * @version $Revision$, $Date$
+ */
+public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {
+
+  /**
+   * Constructs a query parser.
+   * @param field    the default field for query terms.
+   * @param analyzer used to find terms in the query text.
+   */
+  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
+    super(matchVersion, field, analyzer);
+  }
+
+  /**
+   * Called when parser
+   * parses an input term token that contains one or more wildcard
+   * characters (like <code>*</code>), but is not a prefix term token (one
+   * that has just a single * character at the end).
+   * <p>
+   * Example: will be called for <code>H?user</code> or for <code>H*user</code> 
+   * but not for <code>*user</code>.
+   * <p>
+   * Depending on analyzer and settings, a wildcard term may (most probably will)
+   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param  field   Name of the field query will use.
+   * @param  termStr Term token that contains one or more wild card
+   *                 characters (? or *), but is not simple prefix term
+   *
+   * @return Resulting {@link Query} built for the term
+   * @throws ParseException
+   */
+  @Override
+  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
+    List<String> tlist = new ArrayList<String>();
+    List<String> wlist = new ArrayList<String>();
+    /* somewhat a hack: find/store wildcard chars
+     * in order to put them back after analyzing */
+    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
+    StringBuilder tmpBuffer = new StringBuilder();
+    char[] chars = termStr.toCharArray();
+    for (int i = 0; i < termStr.length(); i++) {
+      if (chars[i] == '?' || chars[i] == '*') {
+        if (isWithinToken) {
+          tlist.add(tmpBuffer.toString());
+          tmpBuffer.setLength(0);
+        }
+        isWithinToken = false;
+      } else {
+        if (!isWithinToken) {
+          wlist.add(tmpBuffer.toString());
+          tmpBuffer.setLength(0);
+        }
+        isWithinToken = true;
+      }
+      tmpBuffer.append(chars[i]);
+    }
+    if (isWithinToken) {
+      tlist.add(tmpBuffer.toString());
+    } else {
+      wlist.add(tmpBuffer.toString());
+    }
+
+    // get Analyzer from superclass and tokenize the term
+    TokenStream source;
+    
+    int countTokens = 0;
+    try {
+      source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
+      source.reset();
+    } catch (IOException e1) {
+      throw new RuntimeException(e1);
+    }
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+    while (true) {
+      try {
+        if (!source.incrementToken()) break;
+      } catch (IOException e) {
+        break;
+      }
+      String term = termAtt.toString();
+      if (!"".equals(term)) {
+        try {
+          tlist.set(countTokens++, term);
+        } catch (IndexOutOfBoundsException ioobe) {
+          countTokens = -1;
+        }
+      }
+    }
+    try {
+      source.end();
+      source.close();
+    } catch (IOException e) {
+      // ignore
+    }
+
+    if (countTokens != tlist.size()) {
+      /* this means that the analyzer used either added or consumed 
+       * (common for a stemmer) tokens, and we can't build a WildcardQuery */
+      throw new ParseException("Cannot build WildcardQuery with analyzer "
+          + getAnalyzer().getClass() + " - tokens added or lost");
+    }
+
+    if (tlist.size() == 0) {
+      return null;
+    } else if (tlist.size() == 1) {
+      if (wlist != null && wlist.size() == 1) {
+        /* if wlist contains one wildcard, it must be at the end, because:
+         * 1) wildcards are not allowed in 1st position of a term by QueryParser
+         * 2) if wildcard was *not* in end, there would be *two* or more tokens */
+        return super.getWildcardQuery(field, tlist.get(0)
+            + wlist.get(0).toString());
+      } else {
+        /* we should never get here! if so, this method was called
+         * with a termStr containing no wildcard ... */
+        throw new IllegalArgumentException("getWildcardQuery called without wildcard");
+      }
+    } else {
+      /* the term was tokenized, let's rebuild to one token
+       * with wildcards put back in postion */
+      StringBuilder sb = new StringBuilder();
+      for (int i = 0; i < tlist.size(); i++) {
+        sb.append( tlist.get(i));
+        if (wlist != null && wlist.size() > i) {
+          sb.append(wlist.get(i));
+        }
+      }
+      return super.getWildcardQuery(field, sb.toString());
+    }
+  }
+
+  /**
+   * Called when parser parses an input term
+   * token that uses prefix notation; that is, contains a single '*' wildcard
+   * character as its last character. Since this is a special case
+   * of generic wildcard term, and such a query can be optimized easily,
+   * this usually results in a different query object.
+   * <p>
+   * Depending on analyzer and settings, a prefix term may (most probably will)
+   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param  field   Name of the field query will use.
+   * @param  termStr Term token to use for building term for the query
+   *                 (<b>without</b> trailing '*' character!)
+   *
+   * @return Resulting {@link Query} built for the term
+   * @throws ParseException
+   */
+  @Override
+  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
+    // get Analyzer from superclass and tokenize the term
+    TokenStream source;
+    List<String> tlist = new ArrayList<String>();
+    try {
+      source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
+      source.reset();
+    } catch (IOException e1) {
+      throw new RuntimeException(e1);
+    }
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+    while (true) {
+      try {
+        if (!source.incrementToken()) break;
+      } catch (IOException e) {
+        break;
+      }
+      tlist.add(termAtt.toString());
+    }
+
+    try {
+      source.end();
+      source.close();
+    } catch (IOException e) {
+      // ignore
+    }
+
+    if (tlist.size() == 1) {
+      return super.getPrefixQuery(field, tlist.get(0));
+    } else {
+      /* this means that the analyzer used either added or consumed
+       * (common for a stemmer) tokens, and we can't build a PrefixQuery */
+      throw new ParseException("Cannot build PrefixQuery with analyzer "
+          + getAnalyzer().getClass()
+          + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
+    }
+  }
+
+  /**
+   * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
+   * <p>
+   * Depending on analyzer and settings, a fuzzy term may (most probably will)
+   * be lower-cased automatically. It <b>will</b> go through the default Analyzer.
+   * <p>
+   * Overrides super class, by passing terms through analyzer.
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token to use for building term for the query
+   *
+   * @return Resulting {@link Query} built for the term
+   * @exception ParseException
+   */
+  @Override
+  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
+      throws ParseException {
+    // get Analyzer from superclass and tokenize the term
+    TokenStream source = null;
+    String nextToken = null;
+    boolean multipleTokens = false;
+    
+    try {
+      source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
+      CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
+      source.reset();
+      if (source.incrementToken()) {
+        nextToken = termAtt.toString();
+      }
+      multipleTokens = source.incrementToken();
+    } catch (IOException e) {
+      nextToken = null;
+    }
+
+    try {
+      source.end();
+      source.close();
+    } catch (IOException e) {
+      // ignore
+    }
+
+    if (multipleTokens) {
+      throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
+          + " - tokens were added");
+    }
+
+    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
+  }
+
+  /**
+   * Overrides super class, by passing terms through analyzer.
+   * @exception ParseException
+   */
+  @Override
+  protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
+      throws ParseException {
+    // get Analyzer from superclass and tokenize the terms
+    TokenStream source = null;
+    CharTermAttribute termAtt = null;
+    boolean multipleTokens = false;
+
+    if (part1 != null) {
+      // part1
+      try {
+        source = getAnalyzer().reusableTokenStream(field, new StringReader(part1));
+        termAtt = source.addAttribute(CharTermAttribute.class);
+        source.reset();
+        multipleTokens = false;
+
+
+        if (source.incrementToken()) {
+          part1 = termAtt.toString();
+        }
+        multipleTokens = source.incrementToken();
+      } catch (IOException e) {
+        // ignore
+      }
+
+      try {
+        source.end();
+        source.close();
+      } catch (IOException e) {
+        // ignore
+      }
+      if (multipleTokens) {
+        throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
+            + " - tokens were added to part1");
+      }
+    }
+    try {
+      source.close();
+    } catch (IOException e) {
+      // ignore
+    }
+    if (multipleTokens) {
+      throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
+          + " - tokens were added to part1");
+    }
+
+    if (part2 != null) {
+      try {
+        // part2
+        source = getAnalyzer().reusableTokenStream(field, new StringReader(part2));
+        termAtt = source.addAttribute(CharTermAttribute.class);
+        source.reset();
+        if (source.incrementToken()) {
+          part2 = termAtt.toString();
+        }
+        multipleTokens = source.incrementToken();
+      } catch (IOException e) {
+        // ignore
+      }
+      try {
+        source.end();
+        source.close();
+      } catch (IOException e) {
+        // ignore
+      }
+      if (multipleTokens) {
+        throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
+            + " - tokens were added to part2");
+      }
+    }
+    try {
+      source.close();
+    } catch (IOException e) {
+      // ignore
+    }
+    if (multipleTokens) {
+      throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
+          + " - tokens were added to part2");
+    }
+    return super.getRangeQuery(field, part1, part2, inclusive);
+  }
+
+}