pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / highlighter / src / java / org / apache / lucene / search / vectorhighlight / FieldQuery.java
diff --git a/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java

new file mode 100644 (file)

index 0000000..f2fb16d
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
@@ -0,0 +1,445 @@
+package org.apache.lucene.search.vectorhighlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.regex.RegexQuery;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+
+/**
+ * FieldQuery breaks down query object into terms/phrases and keep
+ * them in QueryPhraseMap structure.
+ */
+public class FieldQuery {
+
+  final boolean fieldMatch;
+
+  // fieldMatch==true,  Map<fieldName,QueryPhraseMap>
+  // fieldMatch==false, Map<null,QueryPhraseMap>
+  Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>();
+
+  // fieldMatch==true,  Map<fieldName,setOfTermsInQueries>
+  // fieldMatch==false, Map<null,setOfTermsInQueries>
+  Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>();
+
+  int termOrPhraseNumber; // used for colored tag support
+
+  // The maximum number of different matching terms accumulated from any one MultiTermQuery
+  private static final int MAX_MTQ_TERMS = 1024;
+
+  FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
+    this.fieldMatch = fieldMatch;
+    List<Query> flatQueries = new ArrayList<Query>();
+    flatten( query, reader, flatQueries );
+    saveTerms( flatQueries, reader );
+    Collection<Query> expandQueries = expand( flatQueries );
+
+    for( Query flatQuery : expandQueries ){
+      QueryPhraseMap rootMap = getRootMap( flatQuery );
+      rootMap.add( flatQuery, reader );
+      if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
+        PhraseQuery pq = (PhraseQuery)flatQuery;
+        if( pq.getTerms().length > 1 ){
+          for( Term term : pq.getTerms() )
+            rootMap.addTerm( term, flatQuery.getBoost() );
+        }
+      }
+    }
+  }
+  
+  /** For backwards compatibility you can initialize FieldQuery without
+   * an IndexReader, which is only required to support MultiTermQuery
+   */
+  FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
+    this (query, null, phraseHighlight, fieldMatch);
+  }
+
+  void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries ) throws IOException{
+    if( sourceQuery instanceof BooleanQuery ){
+      BooleanQuery bq = (BooleanQuery)sourceQuery;
+      for( BooleanClause clause : bq.getClauses() ){
+        if( !clause.isProhibited() )
+          flatten( clause.getQuery(), reader, flatQueries );
+      }
+    }
+    else if( sourceQuery instanceof DisjunctionMaxQuery ){
+      DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
+      for( Query query : dmq ){
+        flatten( query, reader, flatQueries );
+      }
+    }
+    else if( sourceQuery instanceof TermQuery ){
+      if( !flatQueries.contains( sourceQuery ) )
+        flatQueries.add( sourceQuery );
+    }
+    else if (sourceQuery instanceof MultiTermQuery && reader != null) {
+      MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone();
+      copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS));
+      BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader);
+      flatten(mtqTerms, reader, flatQueries);
+    }
+    else if( sourceQuery instanceof PhraseQuery ){
+      if( !flatQueries.contains( sourceQuery ) ){
+        PhraseQuery pq = (PhraseQuery)sourceQuery;
+        if( pq.getTerms().length > 1 )
+          flatQueries.add( pq );
+        else if( pq.getTerms().length == 1 ){
+          flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
+        }
+      }
+    }
+    // else discard queries
+  }
+  
+  /*
+   * Create expandQueries from flatQueries.
+   * 
+   * expandQueries := flatQueries + overlapped phrase queries
+   * 
+   * ex1) flatQueries={a,b,c}
+   *      => expandQueries={a,b,c}
+   * ex2) flatQueries={a,"b c","c d"}
+   *      => expandQueries={a,"b c","c d","b c d"}
+   */
+  Collection<Query> expand( Collection<Query> flatQueries ){
+    List<Query> expandQueries = new ArrayList<Query>();
+    for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){
+      Query query = i.next();
+      i.remove();
+      expandQueries.add( query );
+      if( !( query instanceof PhraseQuery ) ) continue;
+      for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){
+        Query qj = j.next();
+        if( !( qj instanceof PhraseQuery ) ) continue;
+        checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj );
+      }
+    }
+    return expandQueries;
+  }
+
+  /*
+   * Check if PhraseQuery A and B have overlapped part.
+   * 
+   * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
+   * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
+   * ex3) A="a b", B="c d" => no overlap; expandQueries={}
+   */
+  private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){
+    if( a.getSlop() != b.getSlop() ) return;
+    Term[] ats = a.getTerms();
+    Term[] bts = b.getTerms();
+    if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return;
+    checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() );
+    checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() );
+  }
+
+  /*
+   * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
+   * 
+   * ex1) src="a b", dest="c d"       => no overlap
+   * ex2) src="a b", dest="a b c"     => no overlap
+   * ex3) src="a b", dest="b c"       => overlap; expandQueries={"a b c"}
+   * ex4) src="a b c", dest="b c d"   => overlap; expandQueries={"a b c d"}
+   * ex5) src="a b c", dest="b c"     => no overlap
+   * ex6) src="a b c", dest="b"       => no overlap
+   * ex7) src="a a a a", dest="a a a" => overlap;
+   *                                     expandQueries={"a a a a a","a a a a a a"}
+   * ex8) src="a b c d", dest="b c"   => no overlap
+   */
+  private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
+    // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
+    // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
+    // converts PhraseQuery to TermQuery)
+    for( int i = 1; i < src.length; i++ ){
+      boolean overlap = true;
+      for( int j = i; j < src.length; j++ ){
+        if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){
+          overlap = false;
+          break;
+        }
+      }
+      if( overlap && src.length - i < dest.length ){
+        PhraseQuery pq = new PhraseQuery();
+        for( Term srcTerm : src )
+          pq.add( srcTerm );
+        for( int k = src.length - i; k < dest.length; k++ ){
+          pq.add( new Term( src[0].field(), dest[k].text() ) );
+        }
+        pq.setSlop( slop );
+        pq.setBoost( boost );
+        if(!expandQueries.contains( pq ) )
+          expandQueries.add( pq );
+      }
+    }
+  }
+  
+  QueryPhraseMap getRootMap( Query query ){
+    String key = getKey( query );
+    QueryPhraseMap map = rootMaps.get( key );
+    if( map == null ){
+      map = new QueryPhraseMap( this );
+      rootMaps.put( key, map );
+    }
+    return map;
+  }
+  
+  /*
+   * Return 'key' string. 'key' is the field name of the Query.
+   * If not fieldMatch, 'key' will be null.
+   */
+  private String getKey( Query query ){
+    if( !fieldMatch ) return null;
+    if( query instanceof TermQuery )
+      return ((TermQuery)query).getTerm().field();
+    else if ( query instanceof PhraseQuery ){
+      PhraseQuery pq = (PhraseQuery)query;
+      Term[] terms = pq.getTerms();
+      return terms[0].field();
+    }
+    else if (query instanceof FuzzyQuery) {
+      return ((FuzzyQuery)query).getTerm().field();
+    }
+    else if (query instanceof PrefixQuery) {
+      return ((PrefixQuery)query).getPrefix().field();
+    }
+    else if (query instanceof RegexQuery) {
+      return ((RegexQuery)query).getTerm().field();
+    }
+    else if (query instanceof TermRangeQuery) {
+      return ((TermRangeQuery)query).getField();
+    }
+    else if (query instanceof WildcardQuery) {
+      return ((WildcardQuery)query).getTerm().field();
+    }
+    else
+      throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+  }
+
+  /*
+   * Save the set of terms in the queries to termSetMap.
+   * 
+   * ex1) q=name:john
+   *      - fieldMatch==true
+   *          termSetMap=Map<"name",Set<"john">>
+   *      - fieldMatch==false
+   *          termSetMap=Map<null,Set<"john">>
+   *          
+   * ex2) q=name:john title:manager
+   *      - fieldMatch==true
+   *          termSetMap=Map<"name",Set<"john">,
+   *                         "title",Set<"manager">>
+   *      - fieldMatch==false
+   *          termSetMap=Map<null,Set<"john","manager">>
+   *          
+   * ex3) q=name:"john lennon"
+   *      - fieldMatch==true
+   *          termSetMap=Map<"name",Set<"john","lennon">>
+   *      - fieldMatch==false
+   *          termSetMap=Map<null,Set<"john","lennon">>
+   */
+    void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
+    for( Query query : flatQueries ){
+      Set<String> termSet = getTermSet( query );
+      if( query instanceof TermQuery )
+        termSet.add( ((TermQuery)query).getTerm().text() );
+      else if( query instanceof PhraseQuery ){
+        for( Term term : ((PhraseQuery)query).getTerms() )
+          termSet.add( term.text() );
+      }
+      else if (query instanceof MultiTermQuery && reader != null) {
+        BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
+        for (BooleanClause clause : mtqTerms.getClauses()) {
+          termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
+        }
+      }
+      else
+        throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+    }
+  }
+  
+  private Set<String> getTermSet( Query query ){
+    String key = getKey( query );
+    Set<String> set = termSetMap.get( key );
+    if( set == null ){
+      set = new HashSet<String>();
+      termSetMap.put( key, set );
+    }
+    return set;
+  }
+  
+  Set<String> getTermSet( String field ){
+    return termSetMap.get( fieldMatch ? field : null );
+  }
+
+  /**
+   * 
+   * @param fieldName
+   * @param term
+   * @return QueryPhraseMap
+   */
+  public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
+    QueryPhraseMap rootMap = getRootMap( fieldName );
+    return rootMap == null ? null : rootMap.subMap.get( term );
+  }
+
+  /**
+   * 
+   * @param fieldName
+   * @param phraseCandidate
+   * @return QueryPhraseMap
+   */
+  public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){
+    QueryPhraseMap root = getRootMap( fieldName );
+    if( root == null ) return null;
+    return root.searchPhrase( phraseCandidate );
+  }
+  
+  private QueryPhraseMap getRootMap( String fieldName ){
+    return rootMaps.get( fieldMatch ? fieldName : null );
+  }
+  
+  int nextTermOrPhraseNumber(){
+    return termOrPhraseNumber++;
+  }
+  
+  public static class QueryPhraseMap {
+
+    boolean terminal;
+    int slop;   // valid if terminal == true and phraseHighlight == true
+    float boost;  // valid if terminal == true
+    int termOrPhraseNumber;   // valid if terminal == true
+    FieldQuery fieldQuery;
+    Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();
+    
+    public QueryPhraseMap( FieldQuery fieldQuery ){
+      this.fieldQuery = fieldQuery;
+    }
+
+    void addTerm( Term term, float boost ){
+      QueryPhraseMap map = getOrNewMap( subMap, term.text() );
+      map.markTerminal( boost );
+    }
+    
+    private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){
+      QueryPhraseMap map = subMap.get( term );
+      if( map == null ){
+        map = new QueryPhraseMap( fieldQuery );
+        subMap.put( term, map );
+      }
+      return map;
+    }
+
+      void add( Query query, IndexReader reader ) throws IOException {
+      if( query instanceof TermQuery ){
+        addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
+      }
+      else if( query instanceof PhraseQuery ){
+        PhraseQuery pq = (PhraseQuery)query;
+        Term[] terms = pq.getTerms();
+        Map<String, QueryPhraseMap> map = subMap;
+        QueryPhraseMap qpm = null;
+        for( Term term : terms ){
+          qpm = getOrNewMap( map, term.text() );
+          map = qpm.subMap;
+        }
+        qpm.markTerminal( pq.getSlop(), pq.getBoost() );
+      }
+      else
+        throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+    }
+    
+    public QueryPhraseMap getTermMap( String term ){
+      return subMap.get( term );
+    }
+    
+    private void markTerminal( float boost ){
+      markTerminal( 0, boost );
+    }
+    
+    private void markTerminal( int slop, float boost ){
+      this.terminal = true;
+      this.slop = slop;
+      this.boost = boost;
+      this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
+    }
+    
+    public boolean isTerminal(){
+      return terminal;
+    }
+    
+    public int getSlop(){
+      return slop;
+    }
+    
+    public float getBoost(){
+      return boost;
+    }
+    
+    public int getTermOrPhraseNumber(){
+      return termOrPhraseNumber;
+    }
+    
+    public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){
+      QueryPhraseMap currMap = this;
+      for( TermInfo ti : phraseCandidate ){
+        currMap = currMap.subMap.get( ti.getText() );
+        if( currMap == null ) return null;
+      }
+      return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
+    }
+    
+    public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){
+      // check terminal
+      if( !terminal ) return false;
+
+      // if the candidate is a term, it is valid
+      if( phraseCandidate.size() == 1 ) return true;
+
+      // else check whether the candidate is valid phrase
+      // compare position-gaps between terms to slop
+      int pos = phraseCandidate.get( 0 ).getPosition();
+      for( int i = 1; i < phraseCandidate.size(); i++ ){
+        int nextPos = phraseCandidate.get( i ).getPosition();
+        if( Math.abs( nextPos - pos - 1 ) > slop ) return false;
+        pos = nextPos;
+      }
+      return true;
+    }
+  }
+}