pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / queryparser / src / java / org / apache / lucene / queryParser / standard / parser / StandardSyntaxParser.jj
diff --git a/lucene-java-3.5.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj b/lucene-java-3.5.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj

new file mode 100644 (file)

index 0000000..bec992a
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/parser/StandardSyntaxParser.jj
@@ -0,0 +1,472 @@
+/**
+ * Standard file is based on the TextParser.jj from lucene 2.3
+ */
+
+options {
+  STATIC=false;
+  JAVA_UNICODE_ESCAPE=true;
+  USER_CHAR_STREAM=false;
+  IGNORE_CASE=false;
+  JDK_VERSION="1.5";
+}
+
+PARSER_BEGIN(StandardSyntaxParser)
+package org.apache.lucene.queryParser.standard.parser;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Vector;
+
+import org.apache.lucene.messages.Message;
+import org.apache.lucene.messages.MessageImpl;
+import org.apache.lucene.queryParser.core.QueryNodeError;
+import org.apache.lucene.queryParser.core.QueryNodeException;
+import org.apache.lucene.queryParser.core.QueryNodeParseException;
+import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
+import org.apache.lucene.queryParser.core.nodes.AndQueryNode;
+import org.apache.lucene.queryParser.core.nodes.BooleanQueryNode;
+import org.apache.lucene.queryParser.core.nodes.BoostQueryNode;
+import org.apache.lucene.queryParser.core.nodes.FieldQueryNode;
+import org.apache.lucene.queryParser.core.nodes.FuzzyQueryNode;
+import org.apache.lucene.queryParser.core.nodes.ModifierQueryNode;
+import org.apache.lucene.queryParser.core.nodes.GroupQueryNode;
+import org.apache.lucene.queryParser.core.nodes.OpaqueQueryNode;
+import org.apache.lucene.queryParser.core.nodes.OrQueryNode;
+import org.apache.lucene.queryParser.core.nodes.ParametricQueryNode;
+import org.apache.lucene.queryParser.core.nodes.ParametricRangeQueryNode;
+import org.apache.lucene.queryParser.core.nodes.SlopQueryNode;
+import org.apache.lucene.queryParser.core.nodes.ProximityQueryNode;
+import org.apache.lucene.queryParser.core.nodes.QueryNode;
+import org.apache.lucene.queryParser.core.nodes.QueryNodeImpl;
+import org.apache.lucene.queryParser.core.nodes.QuotedFieldQueryNode;
+import org.apache.lucene.queryParser.core.parser.SyntaxParser;
+
+public class StandardSyntaxParser implements SyntaxParser {
+
+       private static final int CONJ_NONE =0;
+       private static final int CONJ_AND =2;
+       private static final int CONJ_OR =2;
+
+ 
+   // syntax parser constructor
+   public StandardSyntaxParser() {
+       this(new StringReader(""));
+  }
+     /** Parses a query string, returning a {@link org.apache.lucene.queryParser.core.nodes.QueryNode}.
+     *  @param query  the query string to be parsed.
+     *  @throws ParseException if the parsing fails
+     */
+    public QueryNode parse(CharSequence query, CharSequence field) throws QueryNodeParseException {
+      ReInit(new StringReader(query.toString()));
+      try {
+        // TopLevelQuery is a Query followed by the end-of-input (EOF)
+        QueryNode querynode = TopLevelQuery(field);
+        return querynode;
+      }
+      catch (ParseException tme) {
+            tme.setQuery(query);
+            throw tme;
+      }
+      catch (Error tme) {
+          Message message = new MessageImpl(QueryParserMessages.INVALID_SYNTAX_CANNOT_PARSE, query, tme.getMessage());
+          QueryNodeParseException e = new QueryNodeParseException(tme);
+            e.setQuery(query);
+            e.setNonLocalizedMessage(message);
+            throw e;
+      }
+    }
+   
+}
+
+PARSER_END(StandardSyntaxParser)
+
+/* ***************** */
+/* Token Definitions */
+/* ***************** */
+
+<*> TOKEN : {
+  <#_NUM_CHAR:   ["0"-"9"] >
+// every character that follows a backslash is considered as an escaped character
+| <#_ESCAPED_CHAR: "\\" ~[] >
+| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^",
+                           "[", "]", "\"", "{", "}", "~", "\\" ]
+                       | <_ESCAPED_CHAR> ) >
+| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) >
+| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") >
+| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) >
+}
+
+<DEFAULT, RangeIn, RangeEx> SKIP : {
+  < <_WHITESPACE>>
+}
+
+<DEFAULT> TOKEN : {
+  <AND:       ("AND" | "&&") >
+| <OR:        ("OR" | "||") >
+| <NOT:       ("NOT" | "!") >
+| <PLUS:      "+" >
+| <MINUS:     "-" >
+| <LPAREN:    "(" >
+| <RPAREN:    ")" >
+| <COLON:     ":" >
+| <CARAT:     "^" > : Boost
+| <QUOTED:     "\"" (<_QUOTED_CHAR>)* "\"">
+| <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
+| <FUZZY_SLOP:     "~" ( (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? )? >
+| <RANGEIN_START: "[" > : RangeIn
+| <RANGEEX_START: "{" > : RangeEx
+}
+
+<Boost> TOKEN : {
+<NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
+}
+
+<RangeIn> TOKEN : {
+<RANGEIN_TO: "TO">
+| <RANGEIN_END: "]"> : DEFAULT
+| <RANGEIN_QUOTED: "\"" (~["\""] | "\\\"")+ "\"">
+| <RANGEIN_GOOP: (~[ " ", "]" ])+ >
+}
+
+<RangeEx> TOKEN : {
+<RANGEEX_TO: "TO">
+| <RANGEEX_END: "}"> : DEFAULT
+| <RANGEEX_QUOTED: "\"" (~["\""] | "\\\"")+ "\"">
+| <RANGEEX_GOOP: (~[ " ", "}" ])+ >
+}
+
+// *   Query  ::= ( Clause )*
+// *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
+
+int Conjunction() : {
+  int ret = CONJ_NONE;
+}
+{
+  [
+    <AND> { ret = CONJ_AND; }
+    | <OR>  { ret = CONJ_OR; }
+  ]
+  { return ret; }
+}
+
+ModifierQueryNode.Modifier Modifiers() : {
+  ModifierQueryNode.Modifier ret = ModifierQueryNode.Modifier.MOD_NONE;
+}
+{
+  [
+     <PLUS> { ret = ModifierQueryNode.Modifier.MOD_REQ; }
+     | <MINUS> { ret = ModifierQueryNode.Modifier.MOD_NOT; }
+     | <NOT> { ret = ModifierQueryNode.Modifier.MOD_NOT; }
+  ]
+  { return ret; }
+}
+
+// This makes sure that there is no garbage after the query string
+QueryNode TopLevelQuery(CharSequence field) : 
+{
+       QueryNode q;
+}
+{
+       q=Query(field) <EOF>
+       {
+               return q;
+       }
+}
+
+// These changes were made to introduce operator precedence:
+// - Clause() now returns a QueryNode. 
+// - The modifiers are consumed by Clause() and returned as part of the QueryNode Object
+// - Query does not consume conjunctions (AND, OR) anymore. 
+// - This is now done by two new non-terminals: ConjClause and DisjClause
+// The parse tree looks similar to this:
+//       Query ::= DisjQuery ( DisjQuery )*
+//   DisjQuery ::= ConjQuery ( OR ConjQuery )* 
+//   ConjQuery ::= Clause ( AND Clause )*
+//      Clause ::= [ Modifier ] ... 
+
+
+QueryNode Query(CharSequence field) :
+{
+  Vector<QueryNode> clauses = null;
+  QueryNode c, first=null;
+}
+{
+  first=DisjQuery(field)
+  (
+    c=DisjQuery(field)
+    { 
+            if (clauses == null) {
+                clauses = new Vector<QueryNode>();
+                clauses.addElement(first); 
+            } 
+        clauses.addElement(c);
+    }
+    )*
+    {
+        if (clauses != null) { 
+               return new BooleanQueryNode(clauses);
+       } else {
+               return first;
+       }
+    }
+}
+
+QueryNode DisjQuery(CharSequence field) : {
+       QueryNode first, c;
+       Vector<QueryNode> clauses = null;
+}
+{
+  first = ConjQuery(field)
+  (
+   <OR> c=ConjQuery(field)
+   { 
+     if (clauses == null) {
+         clauses = new Vector<QueryNode>();
+         clauses.addElement(first); 
+     } 
+     clauses.addElement(c);
+   }
+  )*
+  {
+    if (clauses != null) { 
+           return new OrQueryNode(clauses);
+    } else {
+        return first;
+    }
+  }
+}
+
+QueryNode ConjQuery(CharSequence field) : {
+       QueryNode first, c;
+       Vector<QueryNode> clauses = null;
+}
+{
+  first = ModClause(field)
+  (
+   <AND> c=ModClause(field)
+   { 
+     if (clauses == null) {
+         clauses = new Vector<QueryNode>();
+         clauses.addElement(first);
+     } 
+     clauses.addElement(c); 
+   }
+  )*
+  {
+    if (clauses != null) {     
+           return new AndQueryNode(clauses);
+    } else {
+        return first;
+    }
+  }
+}
+
+// QueryNode Query(CharSequence field) :
+// {
+// List clauses = new ArrayList();
+//   List modifiers = new ArrayList();
+//   QueryNode q, firstQuery=null;
+//   ModifierQueryNode.Modifier mods;
+//   int conj;
+// }
+// {
+//   mods=Modifiers() q=Clause(field)
+//   {
+//     if (mods == ModifierQueryNode.Modifier.MOD_NONE) firstQuery=q;
+//     
+//     // do not create modifier nodes with MOD_NONE
+//     if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
+//             q = new ModifierQueryNode(q, mods);
+//     }
+//     clauses.add(q);
+//   }
+//   (
+//     conj=Conjunction() mods=Modifiers() q=Clause(field)
+//     { 
+//         // do not create modifier nodes with MOD_NONE
+//             if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
+//                     q = new ModifierQueryNode(q, mods);
+//             }
+//             clauses.add(q);
+//             //TODO: figure out what to do with AND and ORs
+//   }
+//   )*
+//     {
+//      if (clauses.size() == 1 && firstQuery != null)
+//         return firstQuery;
+//       else {
+//             return new BooleanQueryNode(clauses);
+//       }
+//     }
+// }
+
+QueryNode ModClause(CharSequence field) : {
+  QueryNode q; 
+  ModifierQueryNode.Modifier mods;
+}
+{
+   mods=Modifiers() q= Clause(field) {
+               if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
+                       q = new ModifierQueryNode(q, mods);
+               }
+               return q;
+   }
+}
+
+QueryNode Clause(CharSequence field) : {
+  QueryNode q;
+  Token fieldToken=null, boost=null;
+  boolean group = false;
+}
+{
+  [
+    LOOKAHEAD(2)
+    (
+    fieldToken=<TERM> <COLON> {field=EscapeQuerySyntaxImpl.discardEscapeChar(fieldToken.image);}    
+    )
+  ]
+
+  (
+   q=Term(field)
+   | <LPAREN> q=Query(field) <RPAREN> (<CARAT> boost=<NUMBER>)? {group=true;}
+
+  )
+    {
+      if (boost != null) {
+                 float f = (float)1.0;
+                 try {
+                   f = Float.valueOf(boost.image).floatValue();
+                   // avoid boosting null queries, such as those caused by stop words
+               if (q != null) {
+                       q = new BoostQueryNode(q, f);
+               }
+                 } catch (Exception ignored) {
+                       /* Should this be handled somehow? (defaults to "no boost", if
+             * boost number is invalid)
+             */                  
+                 }
+      }
+      if (group) { q = new GroupQueryNode(q);}
+      return q;
+    }
+}
+
+
+QueryNode Term(CharSequence field) : {
+  Token term, boost=null, fuzzySlop=null, goop1, goop2;
+  boolean fuzzy = false;
+  QueryNode q =null; 
+  ParametricQueryNode qLower, qUpper;
+  float defaultMinSimilarity = 0.5f;
+}
+{
+  (
+     (
+          term=<TERM> { q = new FieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn); }
+       | term=<NUMBER>
+     )
+     [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
+     [ <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ] ]
+     {
+       if (fuzzy) {
+                 float fms = defaultMinSimilarity;
+                 try {
+            fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
+                 } catch (Exception ignored) { }
+                if(fms < 0.0f || fms > 1.0f){
+                  throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_LIMITS));
+                }
+                q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn);
+       }
+     }
+     | ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP>|goop1=<RANGEIN_QUOTED> )
+         [ <RANGEIN_TO> ] ( goop2=<RANGEIN_GOOP>|goop2=<RANGEIN_QUOTED> )
+         <RANGEIN_END> )
+       [ <CARAT> boost=<NUMBER> ]
+        {
+          if (goop1.kind == RANGEIN_QUOTED) {
+            goop1.image = goop1.image.substring(1, goop1.image.length()-1);
+          }
+          if (goop2.kind == RANGEIN_QUOTED) {
+            goop2.image = goop2.image.substring(1, goop2.image.length()-1);
+          }
+          
+          qLower = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.GE, 
+                                              EscapeQuerySyntaxImpl.discardEscapeChar(goop1.image), goop1.beginColumn, goop1.endColumn);
+                 qUpper = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.LE, 
+                                              EscapeQuerySyntaxImpl.discardEscapeChar(goop2.image), goop2.beginColumn, goop2.endColumn);
+          q = new ParametricRangeQueryNode(qLower, qUpper);
+        }
+     | ( <RANGEEX_START> ( goop1=<RANGEEX_GOOP>|goop1=<RANGEEX_QUOTED> )
+         [ <RANGEEX_TO> ] ( goop2=<RANGEEX_GOOP>|goop2=<RANGEEX_QUOTED> )
+         <RANGEEX_END> )
+       [ <CARAT> boost=<NUMBER> ]
+        {
+          if (goop1.kind == RANGEEX_QUOTED) {
+            goop1.image = goop1.image.substring(1, goop1.image.length()-1);
+          }
+          if (goop2.kind == RANGEEX_QUOTED) {
+            goop2.image = goop2.image.substring(1, goop2.image.length()-1);
+          }
+          qLower = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.GT, 
+                                              EscapeQuerySyntaxImpl.discardEscapeChar(goop1.image), goop1.beginColumn, goop1.endColumn);
+                 qUpper = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.LT, 
+                                              EscapeQuerySyntaxImpl.discardEscapeChar(goop2.image), goop2.beginColumn, goop2.endColumn);
+          q = new ParametricRangeQueryNode(qLower, qUpper);              
+        }
+     | term=<QUOTED> {q = new QuotedFieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image.substring(1, term.image.length()-1)), term.beginColumn + 1, term.endColumn - 1);}
+       [ fuzzySlop=<FUZZY_SLOP> ]
+       [ <CARAT> boost=<NUMBER> ]
+       {       
+         int phraseSlop = 0;
+
+         if (fuzzySlop != null) {
+           try {
+             phraseSlop = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
+             q = new SlopQueryNode(q, phraseSlop);    
+           }
+           catch (Exception ignored) {
+            /* Should this be handled somehow? (defaults to "no PhraseSlop", if
+                * slop number is invalid)
+                */             
+           }
+         }
+              
+       }
+  )
+  {
+         if (boost != null) {
+                 float f = (float)1.0;
+                 try {
+                   f = Float.valueOf(boost.image).floatValue();
+                   // avoid boosting null queries, such as those caused by stop words
+               if (q != null) {
+                       q = new BoostQueryNode(q, f);
+               }
+                 } catch (Exception ignored) {
+                       /* Should this be handled somehow? (defaults to "no boost", if
+                * boost number is invalid)
+                */               
+                 }
+         }
+      return q;
+  }
+}