1 package org.apache.lucene.queryParser.complexPhrase;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Iterator;
23 import java.util.List;
25 import org.apache.lucene.analysis.Analyzer;
26 import org.apache.lucene.index.IndexReader;
27 import org.apache.lucene.index.Term;
28 import org.apache.lucene.queryParser.ParseException;
29 import org.apache.lucene.queryParser.QueryParser;
30 import org.apache.lucene.search.BooleanClause;
31 import org.apache.lucene.search.BooleanQuery;
32 import org.apache.lucene.search.MultiTermQuery;
33 import org.apache.lucene.search.Query;
34 import org.apache.lucene.search.TermQuery;
35 import org.apache.lucene.search.TermRangeQuery;
36 import org.apache.lucene.search.spans.SpanNearQuery;
37 import org.apache.lucene.search.spans.SpanNotQuery;
38 import org.apache.lucene.search.spans.SpanOrQuery;
39 import org.apache.lucene.search.spans.SpanQuery;
40 import org.apache.lucene.search.spans.SpanTermQuery;
41 import org.apache.lucene.util.Version;
/**
 * QueryParser which permits complex phrase query syntax, e.g. "(john jon
 * jonathan~) peters*".
 * <p>
 * Performs potentially multiple passes over Query text to parse any nested
 * logic in PhraseQueries. - First pass takes any PhraseQuery content between
 * quotes and stores it for a subsequent pass. All other query content is
 * parsed as normal. - Second pass parses any stored PhraseQuery content,
 * checking that all embedded clauses refer to the same field and therefore
 * can be rewritten as Span queries. All PhraseQuery clauses are expressed as
 * ComplexPhraseQuery objects.
 * </p>
 * <p>
 * This could arguably be done in one pass using a new QueryParser but here I am
 * working within the constraints of the existing parser as a base class. This
 * currently simply feeds all phrase content through an analyzer to select
 * phrase terms - any "special" syntax such as * ~ * etc are not given special
 * status.
 * </p>
 */
64 public class ComplexPhraseQueryParser extends QueryParser {
65 private ArrayList<ComplexPhraseQuery> complexPhrases = null;
67 private boolean isPass2ResolvingPhrases;
69 private ComplexPhraseQuery currentPhraseQuery = null;
/**
 * Creates a parser; all three arguments are passed straight through to the
 * {@link QueryParser} constructor.
 *
 * @param matchVersion version argument forwarded to the base parser
 * @param f field argument forwarded to the base parser
 * @param a analyzer argument forwarded to the base parser
 */
public ComplexPhraseQueryParser(Version matchVersion, String f, Analyzer a) {
  super(matchVersion, f, a);
}
76 protected Query getFieldQuery(String field, String queryText, int slop) {
77 ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop);
78 complexPhrases.add(cpq); // add to list of phrases to be parsed once
80 // are through with this pass
85 public Query parse(String query) throws ParseException {
86 if (isPass2ResolvingPhrases) {
87 MultiTermQuery.RewriteMethod oldMethod = getMultiTermRewriteMethod();
89 // Temporarily force BooleanQuery rewrite so that Parser will
91 // collection of terms which we can convert into SpanQueries.
92 // ConstantScoreRewrite mode produces an
93 // opaque ConstantScoreQuery object which cannot be interrogated for
94 // terms in the same way a BooleanQuery can.
95 // QueryParser is not guaranteed threadsafe anyway so this temporary
96 // state change should not
98 setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
99 return super.parse(query);
101 setMultiTermRewriteMethod(oldMethod);
105 // First pass - parse the top-level query recording any PhraseQuerys
106 // which will need to be resolved
107 complexPhrases = new ArrayList<ComplexPhraseQuery>();
108 Query q = super.parse(query);
110 // Perform second pass, using this QueryParser to parse any nested
111 // PhraseQueries with different
112 // set of syntax restrictions (i.e. all fields must be same)
113 isPass2ResolvingPhrases = true;
115 for (Iterator<ComplexPhraseQuery> iterator = complexPhrases.iterator(); iterator.hasNext();) {
116 currentPhraseQuery = iterator.next();
117 // in each phrase, now parse the contents between quotes as a
118 // separate parse operation
119 currentPhraseQuery.parsePhraseElements(this);
122 isPass2ResolvingPhrases = false;
127 // There is No "getTermQuery throws ParseException" method to override so
128 // unfortunately need
129 // to throw a runtime exception here if a term for another field is embedded
132 protected Query newTermQuery(Term term) {
133 if (isPass2ResolvingPhrases) {
135 checkPhraseClauseIsForSameField(term.field());
136 } catch (ParseException pe) {
137 throw new RuntimeException("Error parsing complex phrase", pe);
140 return super.newTermQuery(term);
143 // Helper method used to report on any clauses that appear in query syntax
144 private void checkPhraseClauseIsForSameField(String field)
145 throws ParseException {
146 if (!field.equals(currentPhraseQuery.field)) {
147 throw new ParseException("Cannot have clause for field \"" + field
148 + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field
154 protected Query getWildcardQuery(String field, String termStr)
155 throws ParseException {
156 if (isPass2ResolvingPhrases) {
157 checkPhraseClauseIsForSameField(field);
159 return super.getWildcardQuery(field, termStr);
163 protected Query getRangeQuery(String field, String part1, String part2,
164 boolean inclusive) throws ParseException {
165 if (isPass2ResolvingPhrases) {
166 checkPhraseClauseIsForSameField(field);
168 return super.getRangeQuery(field, part1, part2, inclusive);
172 protected Query newRangeQuery(String field, String part1, String part2,
174 if (isPass2ResolvingPhrases) {
175 // Must use old-style RangeQuery in order to produce a BooleanQuery
176 // that can be turned into SpanOr clause
177 TermRangeQuery rangeQuery = new TermRangeQuery(field, part1, part2, inclusive, inclusive,
179 rangeQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
182 return super.newRangeQuery(field, part1, part2, inclusive);
186 protected Query getFuzzyQuery(String field, String termStr,
187 float minSimilarity) throws ParseException {
188 if (isPass2ResolvingPhrases) {
189 checkPhraseClauseIsForSameField(field);
191 return super.getFuzzyQuery(field, termStr, minSimilarity);
/*
 * Used to handle the query content in between quotes and produce Span-based
 * interpretations of the clauses.
 */
198 static class ComplexPhraseQuery extends Query {
202 String phrasedQueryStringContents;
206 private Query contents;
/**
 * Records the raw ingredients of a phrase; the phrase text itself is not
 * parsed here.
 *
 * @param field field the phrase applies to
 * @param phrasedQueryStringContents text found between the quotes
 * @param slopFactor slop given for the phrase
 */
public ComplexPhraseQuery(String field, String phrasedQueryStringContents,
    int slopFactor) {
  super();
  this.field = field;
  this.phrasedQueryStringContents = phrasedQueryStringContents;
  this.slopFactor = slopFactor;
}
216 // Called by ComplexPhraseQueryParser for each phrase after the main
219 protected void parsePhraseElements(QueryParser qp) throws ParseException {
220 // TODO ensure that field-sensitivity is preserved ie the query
221 // string below is parsed as
222 // field+":("+phrasedQueryStringContents+")"
223 // but this will need code in rewrite to unwrap the first layer of
225 contents = qp.parse(phrasedQueryStringContents);
229 public Query rewrite(IndexReader reader) throws IOException {
230 // ArrayList spanClauses = new ArrayList();
231 if (contents instanceof TermQuery) {
234 // Build a sequence of Span clauses arranged in a SpanNear - child
235 // clauses can be complex
236 // Booleans e.g. nots and ors etc
237 int numNegatives = 0;
238 if (!(contents instanceof BooleanQuery)) {
239 throw new IllegalArgumentException("Unknown query type \""
240 + contents.getClass().getName()
241 + "\" found in phrase query string \"" + phrasedQueryStringContents
244 BooleanQuery bq = (BooleanQuery) contents;
245 BooleanClause[] bclauses = bq.getClauses();
246 SpanQuery[] allSpanClauses = new SpanQuery[bclauses.length];
247 // For all clauses e.g. one* two~
248 for (int i = 0; i < bclauses.length; i++) {
249 // HashSet bclauseterms=new HashSet();
250 Query qc = bclauses[i].getQuery();
251 // Rewrite this clause e.g one* becomes (one OR onerous)
252 qc = qc.rewrite(reader);
253 if (bclauses[i].getOccur().equals(BooleanClause.Occur.MUST_NOT)) {
257 if (qc instanceof BooleanQuery) {
258 ArrayList<SpanQuery> sc = new ArrayList<SpanQuery>();
259 addComplexPhraseClause(sc, (BooleanQuery) qc);
261 allSpanClauses[i] = sc.get(0);
263 // Insert fake term e.g. phrase query was for "Fred Smithe*" and
264 // there were no "Smithe*" terms - need to
265 // prevent match on just "Fred".
266 allSpanClauses[i] = new SpanTermQuery(new Term(field,
267 "Dummy clause because no terms found - must match nothing"));
270 if (qc instanceof TermQuery) {
271 TermQuery tq = (TermQuery) qc;
272 allSpanClauses[i] = new SpanTermQuery(tq.getTerm());
274 throw new IllegalArgumentException("Unknown query type \""
275 + qc.getClass().getName()
276 + "\" found in phrase query string \""
277 + phrasedQueryStringContents + "\"");
282 if (numNegatives == 0) {
283 // The simple case - no negative elements in phrase
284 return new SpanNearQuery(allSpanClauses, slopFactor, true);
286 // Complex case - we have mixed positives and negatives in the
288 // Need to return a SpanNotQuery
289 ArrayList<SpanQuery> positiveClauses = new ArrayList<SpanQuery>();
290 for (int j = 0; j < allSpanClauses.length; j++) {
291 if (!bclauses[j].getOccur().equals(BooleanClause.Occur.MUST_NOT)) {
292 positiveClauses.add(allSpanClauses[j]);
296 SpanQuery[] includeClauses = positiveClauses
297 .toArray(new SpanQuery[positiveClauses.size()]);
299 SpanQuery include = null;
300 if (includeClauses.length == 1) {
301 include = includeClauses[0]; // only one positive clause
303 // need to increase slop factor based on gaps introduced by
305 include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
308 // Use sequence of positive and negative values as the exclude.
309 SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
311 SpanNotQuery snot = new SpanNotQuery(include, exclude);
315 private void addComplexPhraseClause(List<SpanQuery> spanClauses, BooleanQuery qc) {
316 ArrayList<SpanQuery> ors = new ArrayList<SpanQuery>();
317 ArrayList<SpanQuery> nots = new ArrayList<SpanQuery>();
318 BooleanClause[] bclauses = qc.getClauses();
320 // For all clauses e.g. one* two~
321 for (int i = 0; i < bclauses.length; i++) {
322 Query childQuery = bclauses[i].getQuery();
324 // select the list to which we will add these options
325 ArrayList<SpanQuery> chosenList = ors;
326 if (bclauses[i].getOccur() == BooleanClause.Occur.MUST_NOT) {
330 if (childQuery instanceof TermQuery) {
331 TermQuery tq = (TermQuery) childQuery;
332 SpanTermQuery stq = new SpanTermQuery(tq.getTerm());
333 stq.setBoost(tq.getBoost());
335 } else if (childQuery instanceof BooleanQuery) {
336 BooleanQuery cbq = (BooleanQuery) childQuery;
337 addComplexPhraseClause(chosenList, cbq);
339 // TODO alternatively could call extract terms here?
340 throw new IllegalArgumentException("Unknown query type:"
341 + childQuery.getClass().getName());
344 if (ors.size() == 0) {
347 SpanOrQuery soq = new SpanOrQuery(ors
348 .toArray(new SpanQuery[ors.size()]));
349 if (nots.size() == 0) {
350 spanClauses.add(soq);
352 SpanOrQuery snqs = new SpanOrQuery(nots
353 .toArray(new SpanQuery[nots.size()]));
354 SpanNotQuery snq = new SpanNotQuery(soq, snqs);
355 spanClauses.add(snq);
360 public String toString(String field) {
361 return "\"" + phrasedQueryStringContents + "\"";
365 public int hashCode() {
366 final int prime = 31;
368 result = prime * result + ((field == null) ? 0 : field.hashCode());
371 + ((phrasedQueryStringContents == null) ? 0
372 : phrasedQueryStringContents.hashCode());
373 result = prime * result + slopFactor;
378 public boolean equals(Object obj) {
383 if (getClass() != obj.getClass())
385 ComplexPhraseQuery other = (ComplexPhraseQuery) obj;
387 if (other.field != null)
389 } else if (!field.equals(other.field))
391 if (phrasedQueryStringContents == null) {
392 if (other.phrasedQueryStringContents != null)
394 } else if (!phrasedQueryStringContents
395 .equals(other.phrasedQueryStringContents))
397 if (slopFactor != other.slopFactor)