1 package org.apache.lucene.search.vectorhighlight;
3 * Licensed to the Apache Software Foundation (ASF) under one or more
4 * contributor license agreements. See the NOTICE file distributed with
5 * this work for additional information regarding copyright ownership.
6 * The ASF licenses this file to You under the Apache License, Version 2.0
7 * (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 import java.io.IOException;
20 import java.util.ArrayList;
21 import java.util.Collection;
22 import java.util.HashMap;
23 import java.util.HashSet;
24 import java.util.Iterator;
25 import java.util.List;
29 import org.apache.lucene.index.IndexReader;
30 import org.apache.lucene.index.Term;
31 import org.apache.lucene.search.BooleanClause;
32 import org.apache.lucene.search.BooleanQuery;
33 import org.apache.lucene.search.DisjunctionMaxQuery;
34 import org.apache.lucene.search.FuzzyQuery;
35 import org.apache.lucene.search.MultiTermQuery;
36 import org.apache.lucene.search.PhraseQuery;
37 import org.apache.lucene.search.PrefixQuery;
38 import org.apache.lucene.search.Query;
39 import org.apache.lucene.search.TermQuery;
40 import org.apache.lucene.search.TermRangeQuery;
41 import org.apache.lucene.search.WildcardQuery;
42 import org.apache.lucene.search.regex.RegexQuery;
43 import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
46 * FieldQuery breaks down query object into terms/phrases and keep
47 * them in QueryPhraseMap structure.
49 public class FieldQuery {
51 final boolean fieldMatch;
53 // fieldMatch==true, Map<fieldName,QueryPhraseMap>
54 // fieldMatch==false, Map<null,QueryPhraseMap>
55 Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>();
57 // fieldMatch==true, Map<fieldName,setOfTermsInQueries>
58 // fieldMatch==false, Map<null,setOfTermsInQueries>
59 Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>();
61 int termOrPhraseNumber; // used for colored tag support
63 // The maximum number of different matching terms accumulated from any one MultiTermQuery
64 private static final int MAX_MTQ_TERMS = 1024;
66 FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
67 this.fieldMatch = fieldMatch;
68 List<Query> flatQueries = new ArrayList<Query>();
69 flatten( query, reader, flatQueries );
70 saveTerms( flatQueries, reader );
71 Collection<Query> expandQueries = expand( flatQueries );
73 for( Query flatQuery : expandQueries ){
74 QueryPhraseMap rootMap = getRootMap( flatQuery );
75 rootMap.add( flatQuery, reader );
76 if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
77 PhraseQuery pq = (PhraseQuery)flatQuery;
78 if( pq.getTerms().length > 1 ){
79 for( Term term : pq.getTerms() )
80 rootMap.addTerm( term, flatQuery.getBoost() );
86 /** For backwards compatibility you can initialize FieldQuery without
87 * an IndexReader, which is only required to support MultiTermQuery
89 FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
90 this (query, null, phraseHighlight, fieldMatch);
93 void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
94 if( sourceQuery instanceof BooleanQuery ){
95 BooleanQuery bq = (BooleanQuery)sourceQuery;
96 for( BooleanClause clause : bq.getClauses() ){
97 if( !clause.isProhibited() )
98 flatten( clause.getQuery(), reader, flatQueries );
101 else if( sourceQuery instanceof DisjunctionMaxQuery ){
102 DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
103 for( Query query : dmq ){
104 flatten( query, reader, flatQueries );
107 else if( sourceQuery instanceof TermQuery ){
108 if( !flatQueries.contains( sourceQuery ) )
109 flatQueries.add( sourceQuery );
111 else if (sourceQuery instanceof MultiTermQuery && reader != null) {
112 MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
113 copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
114 BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
115 flatten(mtqTerms, reader, flatQueries);
117 else if( sourceQuery instanceof PhraseQuery ){
118 if( !flatQueries.contains( sourceQuery ) ){
119 PhraseQuery pq = (PhraseQuery)sourceQuery;
120 if( pq.getTerms().length > 1 )
121 flatQueries.add( pq );
122 else if( pq.getTerms().length == 1 ){
123 flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
127 // else discard queries
131 * Create expandQueries from flatQueries.
133 * expandQueries := flatQueries + overlapped phrase queries
135 * ex1) flatQueries={a,b,c}
136 * => expandQueries={a,b,c}
137 * ex2) flatQueries={a,"b c","c d"}
138 * => expandQueries={a,"b c","c d","b c d"}
140 Collection<Query> expand( Collection<Query> flatQueries ){
141 List<Query> expandQueries = new ArrayList<Query>();
142 for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){
143 Query query = i.next();
145 expandQueries.add( query );
146 if( !( query instanceof PhraseQuery ) ) continue;
147 for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){
149 if( !( qj instanceof PhraseQuery ) ) continue;
150 checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj );
153 return expandQueries;
157 * Check if PhraseQuery A and B have overlapped part.
159 * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
160 * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
161 * ex3) A="a b", B="c d" => no overlap; expandQueries={}
163 private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){
164 if( a.getSlop() != b.getSlop() ) return;
165 Term[] ats = a.getTerms();
166 Term[] bts = b.getTerms();
167 if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return;
168 checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() );
169 checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() );
173 * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
175 * ex1) src="a b", dest="c d" => no overlap
176 * ex2) src="a b", dest="a b c" => no overlap
177 * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"}
178 * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"}
179 * ex5) src="a b c", dest="b c" => no overlap
180 * ex6) src="a b c", dest="b" => no overlap
181 * ex7) src="a a a a", dest="a a a" => overlap;
182 * expandQueries={"a a a a a","a a a a a a"}
183 * ex8) src="a b c d", dest="b c" => no overlap
185 private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
186 // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
187 // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
188 // converts PhraseQuery to TermQuery)
189 for( int i = 1; i < src.length; i++ ){
190 boolean overlap = true;
191 for( int j = i; j < src.length; j++ ){
192 if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){
197 if( overlap && src.length - i < dest.length ){
198 PhraseQuery pq = new PhraseQuery();
199 for( Term srcTerm : src )
201 for( int k = src.length - i; k < dest.length; k++ ){
202 pq.add( new Term( src[0].field(), dest[k].text() ) );
205 pq.setBoost( boost );
206 if(!expandQueries.contains( pq ) )
207 expandQueries.add( pq );
212 QueryPhraseMap getRootMap( Query query ){
213 String key = getKey( query );
214 QueryPhraseMap map = rootMaps.get( key );
216 map = new QueryPhraseMap( this );
217 rootMaps.put( key, map );
223 * Return 'key' string. 'key' is the field name of the Query.
224 * If not fieldMatch, 'key' will be null.
226 private String getKey( Query query ){
227 if( !fieldMatch ) return null;
228 if( query instanceof TermQuery )
229 return ((TermQuery)query).getTerm().field();
230 else if ( query instanceof PhraseQuery ){
231 PhraseQuery pq = (PhraseQuery)query;
232 Term[] terms = pq.getTerms();
233 return terms[0].field();
235 else if (query instanceof FuzzyQuery) {
236 return ((FuzzyQuery)query).getTerm().field();
238 else if (query instanceof PrefixQuery) {
239 return ((PrefixQuery)query).getPrefix().field();
241 else if (query instanceof RegexQuery) {
242 return ((RegexQuery)query).getTerm().field();
244 else if (query instanceof TermRangeQuery) {
245 return ((TermRangeQuery)query).getField();
247 else if (query instanceof WildcardQuery) {
248 return ((WildcardQuery)query).getTerm().field();
251 throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
255 * Save the set of terms in the queries to termSetMap.
259 * termSetMap=Map<"name",Set<"john">>
260 * - fieldMatch==false
261 * termSetMap=Map<null,Set<"john">>
263 * ex2) q=name:john title:manager
265 * termSetMap=Map<"name",Set<"john">,
266 * "title",Set<"manager">>
267 * - fieldMatch==false
268 * termSetMap=Map<null,Set<"john","manager">>
270 * ex3) q=name:"john lennon"
272 * termSetMap=Map<"name",Set<"john","lennon">>
273 * - fieldMatch==false
274 * termSetMap=Map<null,Set<"john","lennon">>
276 void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
277 for( Query query : flatQueries ){
278 Set<String> termSet = getTermSet( query );
279 if( query instanceof TermQuery )
280 termSet.add( ((TermQuery)query).getTerm().text() );
281 else if( query instanceof PhraseQuery ){
282 for( Term term : ((PhraseQuery)query).getTerms() )
283 termSet.add( term.text() );
285 else if (query instanceof MultiTermQuery && reader != null) {
286 BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
287 for (BooleanClause clause : mtqTerms.getClauses()) {
288 termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
292 throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
296 private Set<String> getTermSet( Query query ){
297 String key = getKey( query );
298 Set<String> set = termSetMap.get( key );
300 set = new HashSet<String>();
301 termSetMap.put( key, set );
306 Set<String> getTermSet( String field ){
307 return termSetMap.get( fieldMatch ? field : null );
314 * @return QueryPhraseMap
316 public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
317 QueryPhraseMap rootMap = getRootMap( fieldName );
318 return rootMap == null ? null : rootMap.subMap.get( term );
324 * @param phraseCandidate
325 * @return QueryPhraseMap
327 public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){
328 QueryPhraseMap root = getRootMap( fieldName );
329 if( root == null ) return null;
330 return root.searchPhrase( phraseCandidate );
333 private QueryPhraseMap getRootMap( String fieldName ){
334 return rootMaps.get( fieldMatch ? fieldName : null );
337 int nextTermOrPhraseNumber(){
338 return termOrPhraseNumber++;
341 public static class QueryPhraseMap {
344 int slop; // valid if terminal == true and phraseHighlight == true
345 float boost; // valid if terminal == true
346 int termOrPhraseNumber; // valid if terminal == true
347 FieldQuery fieldQuery;
348 Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();
350 public QueryPhraseMap( FieldQuery fieldQuery ){
351 this.fieldQuery = fieldQuery;
354 void addTerm( Term term, float boost ){
355 QueryPhraseMap map = getOrNewMap( subMap, term.text() );
356 map.markTerminal( boost );
359 private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){
360 QueryPhraseMap map = subMap.get( term );
362 map = new QueryPhraseMap( fieldQuery );
363 subMap.put( term, map );
368 void add( Query query, IndexReader reader ) throws IOException {
369 if( query instanceof TermQuery ){
370 addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
372 else if( query instanceof PhraseQuery ){
373 PhraseQuery pq = (PhraseQuery)query;
374 Term[] terms = pq.getTerms();
375 Map<String, QueryPhraseMap> map = subMap;
376 QueryPhraseMap qpm = null;
377 for( Term term : terms ){
378 qpm = getOrNewMap( map, term.text() );
381 qpm.markTerminal( pq.getSlop(), pq.getBoost() );
384 throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
387 public QueryPhraseMap getTermMap( String term ){
388 return subMap.get( term );
391 private void markTerminal( float boost ){
392 markTerminal( 0, boost );
395 private void markTerminal( int slop, float boost ){
396 this.terminal = true;
399 this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
402 public boolean isTerminal(){
406 public int getSlop(){
410 public float getBoost(){
414 public int getTermOrPhraseNumber(){
415 return termOrPhraseNumber;
418 public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){
419 QueryPhraseMap currMap = this;
420 for( TermInfo ti : phraseCandidate ){
421 currMap = currMap.subMap.get( ti.getText() );
422 if( currMap == null ) return null;
424 return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
427 public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){
429 if( !terminal ) return false;
431 // if the candidate is a term, it is valid
432 if( phraseCandidate.size() == 1 ) return true;
434 // else check whether the candidate is valid phrase
435 // compare position-gaps between terms to slop
436 int pos = phraseCandidate.get( 0 ).getPosition();
437 for( int i = 1; i < phraseCandidate.size(); i++ ){
438 int nextPos = phraseCandidate.get( i ).getPosition();
439 if( Math.abs( nextPos - pos - 1 ) > slop ) return false;