lucene-java-3.4.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java

   1 /**
   2  * Copyright 2004-2005 The Apache Software Foundation.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 package org.apache.lucene.search.similar;
  17
  18 import java.io.File;
  19 import java.io.FileReader;
  20 import java.io.IOException;
  21 import java.io.InputStreamReader;
  22 import java.io.PrintStream;
  23 import java.io.Reader;
  24 import java.io.StringReader;
  25 import java.net.URL;
  26 import java.util.ArrayList;
  27 import java.util.Collection;
  28 import java.util.HashMap;
  29 import java.util.Iterator;
  30 import java.util.Map;
  31 import java.util.Set;
  32
  33 import org.apache.lucene.analysis.Analyzer;
  34 import org.apache.lucene.analysis.TokenStream;
  35 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  36 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  37 import org.apache.lucene.document.Document;
  38 import org.apache.lucene.index.IndexReader;
  39 import org.apache.lucene.index.Term;
  40 import org.apache.lucene.index.TermFreqVector;
  41 import org.apache.lucene.search.BooleanClause;
  42 import org.apache.lucene.search.BooleanQuery;
  43 import org.apache.lucene.search.DefaultSimilarity;
  44 import org.apache.lucene.search.IndexSearcher;
  45 import org.apache.lucene.search.Query;
  46 import org.apache.lucene.search.ScoreDoc;
  47 import org.apache.lucene.search.Similarity;
  48 import org.apache.lucene.search.TermQuery;
  49 import org.apache.lucene.search.TopDocs;
  50 import org.apache.lucene.store.FSDirectory;
  51 import org.apache.lucene.util.PriorityQueue;
  52 import org.apache.lucene.util.Version;
  53
  54 /**
  55  * Generate "more like this" similarity queries. Based on this mail: <code><pre>
  56  * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
  57  * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
  58  * is usually fast enough.  But looking up the docFreq() of every term in the document is
  59  * probably too slow.
  60  *
  61  * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
  62  * or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
  63  * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
  64  * reduce the number of terms under consideration.  Another heuristic is that terms with a
  65  * high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
  66  * number of characters, not selecting anything less than, e.g., six or seven characters.
  67  * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
  68  * that do a pretty good job of characterizing a document.
  69  *
  70  * It all depends on what you're trying to do.  If you're trying to eek out that last percent
  71  * of precision and recall regardless of computational difficulty so that you can win a TREC
  72  * competition, then the techniques I mention above are useless.  But if you're trying to
  73  * provide a "more like this" button on a search results page that does a decent job and has
  74  * good performance, such techniques might be useful.
  75  *
  76  * An efficient, effective "more-like-this" query generator would be a great contribution, if
  77  * anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
  78  * text), analyzer Analyzer, and return a set of representative terms using heuristics like those
  79  * above.  The frequency and length thresholds could be parameters, etc.
  80  *
  81  * Doug
  82  * </pre></code>
  83  *
  84  *
  85  * <p>
  86  * <h3>Initial Usage</h3>
  87  * <p/>
  88  * This class has lots of options to try to make it efficient and flexible.
  89  * The simplest possible usage is as follows. The bold
  90  * fragment is specific to this class.
  91  *
  92  * <pre class="prettyprint">
  93  *
  94  * IndexReader ir = ...
  95  * IndexSearcher is = ...
  96  *
  97  * MoreLikeThis mlt = new MoreLikeThis(ir);
  98  * Reader target = ... // orig source of doc you want to find similarities to
  99  * Query query = mlt.like( target);
 100  *
  101  * TopDocs hits = is.search(query, 10);
 102  * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
 103  * //you ignore the doc if it matches your 'target' document, as it should be similar to itself
 104  *
 105  * </pre>
 106  *
 107  * Thus you:
 108  * <ol>
 109  * <li>do your normal, Lucene setup for searching,
 110  * <li>create a MoreLikeThis,
 111  * <li>get the text of the doc you want to find similarities to
 112  * <li>then call one of the like() calls to generate a similarity query
 113  * <li>call the searcher to find the similar docs
 114  * </ol>
 115  *
 116  * <h3>More Advanced Usage</h3>
 117  *
 118  * You may want to use {@link #setFieldNames setFieldNames(...)} so you can
 119  * examine multiple fields (e.g. body and title) for similarity.
 120  * <p>
 121  *
 122  * Depending on the size of your index and the size and makeup of your documents
 123  * you may want to call the other set methods to control how the similarity
 124  * queries are generated:
 125  * <ul>
 126  * <li> {@link #setMinTermFreq setMinTermFreq(...)}
 127  * <li> {@link #setMinDocFreq setMinDocFreq(...)}
 128  * <li> {@link #setMaxDocFreq setMaxDocFreq(...)}
 129  * <li> {@link #setMaxDocFreqPct setMaxDocFreqPct(...)}
 130  * <li> {@link #setMinWordLen setMinWordLen(...)}
 131  * <li> {@link #setMaxWordLen setMaxWordLen(...)}
 132  * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
 133  * <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
  134  * <li> {@link #setStopWords setStopWords(...)}
 135  * </ul>
 136  *
 137  * <hr>
 138  *
 139  * <pre>
 140  * Changes: Mark Harwood 29/02/04
 141  * Some bugfixing, some refactoring, some optimisation.
 142  *  - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
 143  *  - bugfix: No significant terms being created for fields with a termvector - because
 144  *            was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
 145  *  - refactor: moved common code into isNoiseWord()
 146  *  - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
 147  * </pre>
 148  *
 149  */
 150 public final class MoreLikeThis {
 151
  /**
   * Default cap on the number of tokens analyzed per example-document field
   * when that field is not stored with TermVector support.
   *
   * @see #getMaxNumTokensParsed
   * @see #setMaxNumTokensParsed
   */
  public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;

  /**
   * Default analyzer used to tokenize source text: a {@link StandardAnalyzer}
   * created with {@link Version#LUCENE_CURRENT}.
   *
   * @see #getAnalyzer
   * @see #setAnalyzer
   * @deprecated This default will be removed in Lucene 4.0 (with the default
   *             being null). If you are not using term vectors, explicitly set
   *             your analyzer instead.
   */
  @Deprecated
  public static final Analyzer DEFAULT_ANALYZER = new StandardAnalyzer(
      Version.LUCENE_CURRENT);

  /**
   * Default minimum term frequency: terms occurring fewer times than this in
   * the source doc are ignored.
   *
   * @see #getMinTermFreq
   * @see #setMinTermFreq
   */
  public static final int DEFAULT_MIN_TERM_FREQ = 2;

  /**
   * Default minimum document frequency: words that do not occur in at least
   * this many docs are ignored.
   *
   * @see #getMinDocFreq
   * @see #setMinDocFreq
   */
  public static final int DEFAULT_MIN_DOC_FREQ = 5;

  /**
   * Default maximum document frequency: words occurring in more than this many
   * docs are ignored. {@link Integer#MAX_VALUE} means there is effectively no
   * upper limit.
   *
   * @see #getMaxDocFreq
   * @see #setMaxDocFreq
   * @see #setMaxDocFreqPct
   */
  public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;

  /**
   * Default for whether terms in the generated query are boosted according to
   * their score (off by default).
   *
   * @see #isBoost
   * @see #setBoost
   */
  public static final boolean DEFAULT_BOOST = false;

  /**
   * Default field names: the single field <code>"contents"</code>. Passing
   * null to {@link #setFieldNames} instead requests that the field names be
   * looked up at runtime from the provided reader.
   *
   * @see #getFieldNames
   * @see #setFieldNames
   */
  public static final String[] DEFAULT_FIELD_NAMES = new String[] {"contents"};

  /**
   * Default minimum word length: words shorter than this are ignored; 0
   * disables the check.
   *
   * @see #getMinWordLen
   * @see #setMinWordLen
   */
  public static final int DEFAULT_MIN_WORD_LENGTH = 0;

  /**
   * Default maximum word length: words longer than this are ignored; 0
   * disables the check.
   *
   * @see #getMaxWordLen
   * @see #setMaxWordLen
   */
  public static final int DEFAULT_MAX_WORD_LENGTH = 0;

  /**
   * Default set of stop words. Null means that stop words are allowed, i.e. no
   * stop-word filtering is applied by this class.
   *
   * @see #setStopWords
   * @see #getStopWords
   */
  public static final Set<?> DEFAULT_STOP_WORDS = null;

  /**
   * Current set of stop words; null means no stop-word filtering.
   */
  private Set<?> stopWords = DEFAULT_STOP_WORDS;

  /**
   * Default upper bound on the number of terms in a generated query.
   *
   * @see BooleanQuery#getMaxClauseCount
   * @see #getMaxQueryTerms
   * @see #setMaxQueryTerms
   */
  public static final int DEFAULT_MAX_QUERY_TERMS = 25;
 248
  /**
   * Analyzer used to tokenize source text that is not available as a term
   * vector.
   */
  private Analyzer analyzer = DEFAULT_ANALYZER;

  /**
   * Ignore words less frequent than this in the source doc.
   */
  private int minTermFreq = DEFAULT_MIN_TERM_FREQ;

  /**
   * Ignore words which do not occur in at least this many docs.
   */
  private int minDocFreq = DEFAULT_MIN_DOC_FREQ;

  /**
   * Ignore words which occur in more than this many docs.
   */
  private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;

  /**
   * Whether to apply a boost to each query term based on its score.
   */
  private boolean boost = DEFAULT_BOOST;

  /**
   * Names of the fields to analyze; null means "resolve from the index at
   * runtime".
   */
  private String[] fieldNames = DEFAULT_FIELD_NAMES;

  /**
   * The maximum number of tokens to parse in each example doc field that is not
   * stored with TermVector support.
   */
  private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;

  /**
   * Ignore words shorter than this length (0 disables the check).
   */
  private int minWordLen = DEFAULT_MIN_WORD_LENGTH;

  /**
   * Ignore words longer than this length (0 disables the check).
   */
  private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;

  /**
   * Don't return a query with more terms than this.
   */
  private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;

  /**
   * Similarity used for idf() calculations.
   */
  private Similarity similarity;// assigned by the constructors

  /**
   * IndexReader used for document frequencies, term vectors and stored fields.
   */
  private final IndexReader ir;

  /**
   * Boost factor to use when boosting the terms.
   */
  private float boostFactor = 1;
 314
 315   /**
 316    * Returns the boost factor used when boosting terms
 317    *
 318    * @return the boost factor used when boosting terms
 319    */
 320   public float getBoostFactor() {
 321     return boostFactor;
 322   }
 323
 324   /**
 325    * Sets the boost factor to use when boosting terms
 326    *
 327    * @param boostFactor
 328    */
 329   public void setBoostFactor(float boostFactor) {
 330     this.boostFactor = boostFactor;
 331   }
 332
 333   /**
 334    * Constructor requiring an IndexReader.
 335    */
 336   public MoreLikeThis(IndexReader ir) {
 337     this(ir, new DefaultSimilarity());
 338   }
 339
 340   public MoreLikeThis(IndexReader ir, Similarity sim) {
 341     this.ir = ir;
 342     this.similarity = sim;
 343   }
 344
 345   public Similarity getSimilarity() {
 346     return similarity;
 347   }
 348
 349   public void setSimilarity(Similarity similarity) {
 350     this.similarity = similarity;
 351   }
 352
 353   /**
 354    * Returns an analyzer that will be used to parse source doc with. The default
 355    * analyzer is the {@link #DEFAULT_ANALYZER}.
 356    *
 357    * @return the analyzer that will be used to parse source doc with.
 358    * @see #DEFAULT_ANALYZER
 359    */
 360   public Analyzer getAnalyzer() {
 361     return analyzer;
 362   }
 363
 364   /**
 365    * Sets the analyzer to use. An analyzer is not required for generating a
 366    * query with the {@link #like(int)} method, all other 'like' methods require
 367    * an analyzer.
 368    *
 369    * @param analyzer
 370    *          the analyzer to use to tokenize text.
 371    */
 372   public void setAnalyzer(Analyzer analyzer) {
 373     this.analyzer = analyzer;
 374   }
 375
 376   /**
 377    * Returns the frequency below which terms will be ignored in the source doc.
 378    * The default frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
 379    *
 380    * @return the frequency below which terms will be ignored in the source doc.
 381    */
 382   public int getMinTermFreq() {
 383     return minTermFreq;
 384   }
 385
 386   /**
 387    * Sets the frequency below which terms will be ignored in the source doc.
 388    *
 389    * @param minTermFreq
 390    *          the frequency below which terms will be ignored in the source doc.
 391    */
 392   public void setMinTermFreq(int minTermFreq) {
 393     this.minTermFreq = minTermFreq;
 394   }
 395
 396   /**
 397    * Returns the frequency at which words will be ignored which do not occur in
 398    * at least this many docs. The default frequency is
 399    * {@link #DEFAULT_MIN_DOC_FREQ}.
 400    *
 401    * @return the frequency at which words will be ignored which do not occur in
 402    *         at least this many docs.
 403    */
 404   public int getMinDocFreq() {
 405     return minDocFreq;
 406   }
 407
 408   /**
 409    * Sets the frequency at which words will be ignored which do not occur in at
 410    * least this many docs.
 411    *
 412    * @param minDocFreq
 413    *          the frequency at which words will be ignored which do not occur in
 414    *          at least this many docs.
 415    */
 416   public void setMinDocFreq(int minDocFreq) {
 417     this.minDocFreq = minDocFreq;
 418   }
 419
 420   /**
 421    * Returns the maximum frequency in which words may still appear. Words that
 422    * appear in more than this many docs will be ignored. The default frequency
 423    * is {@link #DEFAULT_MAX_DOC_FREQ}.
 424    *
 425    * @return get the maximum frequency at which words are still allowed, words
 426    *         which occur in more docs than this are ignored.
 427    */
 428   public int getMaxDocFreq() {
 429     return maxDocFreq;
 430   }
 431
 432   /**
 433    * Set the maximum frequency in which words may still appear. Words that
 434    * appear in more than this many docs will be ignored.
 435    *
 436    * @param maxFreq
 437    *          the maximum count of documents that a term may appear in to be
 438    *          still considered relevant
 439    */
 440   public void setMaxDocFreq(int maxFreq) {
 441     this.maxDocFreq = maxFreq;
 442   }
 443
 444   /**
 445    * Set the maximum percentage in which words may still appear. Words that
 446    * appear in more than this many percent of all docs will be ignored.
 447    *
 448    * @param maxPercentage
 449    *          the maximum percentage of documents (0-100) that a term may appear
 450    *          in to be still considered relevant
 451    */
 452   public void setMaxDocFreqPct(int maxPercentage) {
 453     this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
 454   }
 455
 456   /**
 457    * Returns whether to boost terms in query based on "score" or not. The
 458    * default is {@link #DEFAULT_BOOST}.
 459    *
 460    * @return whether to boost terms in query based on "score" or not.
 461    * @see #setBoost
 462    */
 463   public boolean isBoost() {
 464     return boost;
 465   }
 466
 467   /**
 468    * Sets whether to boost terms in query based on "score" or not.
 469    *
 470    * @param boost
 471    *          true to boost terms in query based on "score", false otherwise.
 472    * @see #isBoost
 473    */
 474   public void setBoost(boolean boost) {
 475     this.boost = boost;
 476   }
 477
 478   /**
 479    * Returns the field names that will be used when generating the 'More Like
 480    * This' query. The default field names that will be used is
 481    * {@link #DEFAULT_FIELD_NAMES}.
 482    *
 483    * @return the field names that will be used when generating the 'More Like
 484    *         This' query.
 485    */
 486   public String[] getFieldNames() {
 487     return fieldNames;
 488   }
 489
 490   /**
 491    * Sets the field names that will be used when generating the 'More Like This'
 492    * query. Set this to null for the field names to be determined at runtime
 493    * from the IndexReader provided in the constructor.
 494    *
 495    * @param fieldNames
 496    *          the field names that will be used when generating the 'More Like
 497    *          This' query.
 498    */
 499   public void setFieldNames(String[] fieldNames) {
 500     this.fieldNames = fieldNames;
 501   }
 502
 503   /**
 504    * Returns the minimum word length below which words will be ignored. Set this
 505    * to 0 for no minimum word length. The default is
 506    * {@link #DEFAULT_MIN_WORD_LENGTH}.
 507    *
 508    * @return the minimum word length below which words will be ignored.
 509    */
 510   public int getMinWordLen() {
 511     return minWordLen;
 512   }
 513
 514   /**
 515    * Sets the minimum word length below which words will be ignored.
 516    *
 517    * @param minWordLen
 518    *          the minimum word length below which words will be ignored.
 519    */
 520   public void setMinWordLen(int minWordLen) {
 521     this.minWordLen = minWordLen;
 522   }
 523
 524   /**
 525    * Returns the maximum word length above which words will be ignored. Set this
 526    * to 0 for no maximum word length. The default is
 527    * {@link #DEFAULT_MAX_WORD_LENGTH}.
 528    *
 529    * @return the maximum word length above which words will be ignored.
 530    */
 531   public int getMaxWordLen() {
 532     return maxWordLen;
 533   }
 534
 535   /**
 536    * Sets the maximum word length above which words will be ignored.
 537    *
 538    * @param maxWordLen
 539    *          the maximum word length above which words will be ignored.
 540    */
 541   public void setMaxWordLen(int maxWordLen) {
 542     this.maxWordLen = maxWordLen;
 543   }
 544
 545   /**
 546    * Set the set of stopwords. Any word in this set is considered
 547    * "uninteresting" and ignored. Even if your Analyzer allows stopwords, you
 548    * might want to tell the MoreLikeThis code to ignore them, as for the
 549    * purposes of document similarity it seems reasonable to assume that
 550    * "a stop word is never interesting".
 551    *
 552    * @param stopWords
 553    *          set of stopwords, if null it means to allow stop words
 554    *
 555    * @see org.apache.lucene.analysis.StopFilter#makeStopSet
 556    *      StopFilter.makeStopSet()
 557    * @see #getStopWords
 558    */
 559   public void setStopWords(Set<?> stopWords) {
 560     this.stopWords = stopWords;
 561   }
 562
 563   /**
 564    * Get the current stop words being used.
 565    *
 566    * @see #setStopWords
 567    */
 568   public Set<?> getStopWords() {
 569     return stopWords;
 570   }
 571
 572   /**
 573    * Returns the maximum number of query terms that will be included in any
 574    * generated query. The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
 575    *
 576    * @return the maximum number of query terms that will be included in any
 577    *         generated query.
 578    */
 579   public int getMaxQueryTerms() {
 580     return maxQueryTerms;
 581   }
 582
 583   /**
 584    * Sets the maximum number of query terms that will be included in any
 585    * generated query.
 586    *
 587    * @param maxQueryTerms
 588    *          the maximum number of query terms that will be included in any
 589    *          generated query.
 590    */
 591   public void setMaxQueryTerms(int maxQueryTerms) {
 592     this.maxQueryTerms = maxQueryTerms;
 593   }
 594
 595   /**
 596    * @return The maximum number of tokens to parse in each example doc field
 597    *         that is not stored with TermVector support
 598    * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
 599    */
 600   public int getMaxNumTokensParsed() {
 601     return maxNumTokensParsed;
 602   }
 603
 604   /**
 605    * @param i
 606    *          The maximum number of tokens to parse in each example doc field
 607    *          that is not stored with TermVector support
 608    */
 609   public void setMaxNumTokensParsed(int i) {
 610     maxNumTokensParsed = i;
 611   }
 612
 613   /**
 614    * Return a query that will return docs like the passed lucene document ID.
 615    *
 616    * @param docNum
 617    *          the documentID of the lucene doc to generate the 'More Like This"
 618    *          query for.
 619    * @return a query that will return docs like the passed lucene document ID.
 620    */
 621   public Query like(int docNum) throws IOException {
 622     if (fieldNames == null) {
 623       // gather list of valid fields from lucene
 624       Collection<String> fields = ir
 625           .getFieldNames(IndexReader.FieldOption.INDEXED);
 626       fieldNames = fields.toArray(new String[fields.size()]);
 627     }
 628
 629     return createQuery(retrieveTerms(docNum));
 630   }
 631
 632   /**
 633    * Return a query that will return docs like the passed file.
 634    *
 635    * @return a query that will return docs like the passed file.
 636    * @deprecated use {@link #like(Reader, String)} instead */
 637   @Deprecated
 638   public Query like(File f) throws IOException {
 639     if (fieldNames == null) {
 640       // gather list of valid fields from lucene
 641       Collection<String> fields = ir
 642           .getFieldNames(IndexReader.FieldOption.INDEXED);
 643       fieldNames = fields.toArray(new String[fields.size()]);
 644     }
 645
 646     return like(new FileReader(f));
 647   }
 648
 649   /**
 650    * Return a query that will return docs like the passed URL.
 651    *
 652    * @return a query that will return docs like the passed URL.
 653    * @deprecated use {@link #like(Reader, String)} instead */
 654   @Deprecated
 655   public Query like(URL u) throws IOException {
 656     return like(new InputStreamReader(u.openConnection().getInputStream()));
 657   }
 658
 659   /**
 660    * Return a query that will return docs like the passed stream.
 661    *
 662    * @return a query that will return docs like the passed stream.
 663    * @deprecated use {@link #like(Reader, String)} instead */
 664   @Deprecated
 665   public Query like(java.io.InputStream is) throws IOException {
 666     return like(new InputStreamReader(is));
 667   }
 668
 669   /** @deprecated use {@link #like(Reader, String)} instead */
 670   @Deprecated
 671   public Query like(Reader r) throws IOException {
 672     return createQuery(retrieveTerms(r, fieldNames[0]));
 673   }
 674
 675   /**
 676    * Return a query that will return docs like the passed Reader.
 677    *
 678    * @return a query that will return docs like the passed Reader.
 679    */
 680   public Query like(Reader r, String fieldName) throws IOException {
 681     return createQuery(retrieveTerms(r, fieldName));
 682   }
 683
 684   /**
 685    * Create the More like query from a PriorityQueue
 686    */
 687   private Query createQuery(PriorityQueue<Object[]> q) {
 688     BooleanQuery query = new BooleanQuery();
 689     Object cur;
 690     int qterms = 0;
 691     float bestScore = 0;
 692
 693     while (((cur = q.pop()) != null)) {
 694       Object[] ar = (Object[]) cur;
 695       TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
 696
 697       if (boost) {
 698         if (qterms == 0) {
 699           bestScore = ((Float) ar[2]).floatValue();
 700         }
 701         float myScore = ((Float) ar[2]).floatValue();
 702
 703         tq.setBoost(boostFactor * myScore / bestScore);
 704       }
 705
 706       try {
 707         query.add(tq, BooleanClause.Occur.SHOULD);
 708       } catch (BooleanQuery.TooManyClauses ignore) {
 709         break;
 710       }
 711
 712       qterms++;
 713       if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
 714         break;
 715       }
 716     }
 717
 718     return query;
 719   }
 720
 721   /**
 722    * Create a PriorityQueue from a word->tf map.
 723    *
 724    * @param words
 725    *          a map of words keyed on the word(String) with Int objects as the
 726    *          values.
 727    */
 728   private PriorityQueue<Object[]> createQueue(Map<String,Int> words)
 729       throws IOException {
 730     // have collected all words in doc and their freqs
 731     int numDocs = ir.numDocs();
 732     FreqQ res = new FreqQ(words.size()); // will order words by score
 733
 734     Iterator<String> it = words.keySet().iterator();
 735     while (it.hasNext()) { // for every word
 736       String word = it.next();
 737
 738       int tf = words.get(word).x; // term freq in the source doc
 739       if (minTermFreq > 0 && tf < minTermFreq) {
 740         continue; // filter out words that don't occur enough times in the
 741                   // source
 742       }
 743
 744       // go through all the fields and find the largest document frequency
 745       String topField = fieldNames[0];
 746       int docFreq = 0;
 747       for (int i = 0; i < fieldNames.length; i++) {
 748         int freq = ir.docFreq(new Term(fieldNames[i], word));
 749         topField = (freq > docFreq) ? fieldNames[i] : topField;
 750         docFreq = (freq > docFreq) ? freq : docFreq;
 751       }
 752
 753       if (minDocFreq > 0 && docFreq < minDocFreq) {
 754         continue; // filter out words that don't occur in enough docs
 755       }
 756
 757       if (docFreq > maxDocFreq) {
 758         continue; // filter out words that occur in too many docs
 759       }
 760
 761       if (docFreq == 0) {
 762         continue; // index update problem?
 763       }
 764
 765       float idf = similarity.idf(docFreq, numDocs);
 766       float score = tf * idf;
 767
 768       // only really need 1st 3 entries, other ones are for troubleshooting
 769       res.insertWithOverflow(new Object[] {word, // the word
 770           topField, // the top field
 771           Float.valueOf(score), // overall score
 772           Float.valueOf(idf), // idf
 773           Integer.valueOf(docFreq), // freq in all docs
 774           Integer.valueOf(tf)});
 775     }
 776     return res;
 777   }
 778
 779   /**
 780    * Describe the parameters that control how the "more like this" query is
 781    * formed.
 782    */
 783   public String describeParams() {
 784     StringBuilder sb = new StringBuilder();
 785     sb.append("\t" + "maxQueryTerms  : " + maxQueryTerms + "\n");
 786     sb.append("\t" + "minWordLen     : " + minWordLen + "\n");
 787     sb.append("\t" + "maxWordLen     : " + maxWordLen + "\n");
 788     sb.append("\t" + "fieldNames     : ");
 789     String delim = "";
 790     for (int i = 0; i < fieldNames.length; i++) {
 791       String fieldName = fieldNames[i];
 792       sb.append(delim).append(fieldName);
 793       delim = ", ";
 794     }
 795     sb.append("\n");
 796     sb.append("\t" + "boost          : " + boost + "\n");
 797     sb.append("\t" + "minTermFreq    : " + minTermFreq + "\n");
 798     sb.append("\t" + "minDocFreq     : " + minDocFreq + "\n");
 799     return sb.toString();
 800   }
 801
 802   /**
 803    * Find words for a more-like-this query former.
 804    *
 805    * @param docNum
 806    *          the id of the lucene document from which to find terms
 807    */
 808   public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
 809     Map<String,Int> termFreqMap = new HashMap<String,Int>();
 810     for (int i = 0; i < fieldNames.length; i++) {
 811       String fieldName = fieldNames[i];
 812       TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
 813
 814       // field does not store term vector info
 815       if (vector == null) {
 816         Document d = ir.document(docNum);
 817         String text[] = d.getValues(fieldName);
 818         if (text != null) {
 819           for (int j = 0; j < text.length; j++) {
 820             addTermFrequencies(new StringReader(text[j]), termFreqMap,
 821                 fieldName);
 822           }
 823         }
 824       } else {
 825         addTermFrequencies(termFreqMap, vector);
 826       }
 827
 828     }
 829
 830     return createQueue(termFreqMap);
 831   }
 832
 833   /**
 834    * Adds terms and frequencies found in vector into the Map termFreqMap
 835    *
 836    * @param termFreqMap
 837    *          a Map of terms and their frequencies
 838    * @param vector
 839    *          List of terms and their frequencies for a doc/field
 840    */
 841   private void addTermFrequencies(Map<String,Int> termFreqMap,
 842       TermFreqVector vector) {
 843     String[] terms = vector.getTerms();
 844     int freqs[] = vector.getTermFrequencies();
 845     for (int j = 0; j < terms.length; j++) {
 846       String term = terms[j];
 847
 848       if (isNoiseWord(term)) {
 849         continue;
 850       }
 851       // increment frequency
 852       Int cnt = termFreqMap.get(term);
 853       if (cnt == null) {
 854         cnt = new Int();
 855         termFreqMap.put(term, cnt);
 856         cnt.x = freqs[j];
 857       } else {
 858         cnt.x += freqs[j];
 859       }
 860     }
 861   }
 862
 863   /**
 864    * Adds term frequencies found by tokenizing text from reader into the Map
 865    * words
 866    *
 867    * @param r
 868    *          a source of text to be tokenized
 869    * @param termFreqMap
 870    *          a Map of terms and their frequencies
 871    * @param fieldName
 872    *          Used by analyzer for any special per-field analysis
 873    */
 874   private void addTermFrequencies(Reader r, Map<String,Int> termFreqMap,
 875       String fieldName) throws IOException {
 876     TokenStream ts = analyzer.reusableTokenStream(fieldName, r);
 877     int tokenCount = 0;
 878     // for every token
 879     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 880     ts.reset();
 881     while (ts.incrementToken()) {
 882       String word = termAtt.toString();
 883       tokenCount++;
 884       if (tokenCount > maxNumTokensParsed) {
 885         break;
 886       }
 887       if (isNoiseWord(word)) {
 888         continue;
 889       }
 890
 891       // increment frequency
 892       Int cnt = termFreqMap.get(word);
 893       if (cnt == null) {
 894         termFreqMap.put(word, new Int());
 895       } else {
 896         cnt.x++;
 897       }
 898     }
 899     ts.end();
 900     ts.close();
 901   }
 902
 903   /**
 904    * determines if the passed term is likely to be of interest in "more like"
 905    * comparisons
 906    *
 907    * @param term
 908    *          The word being considered
 909    * @return true if should be ignored, false if should be used in further
 910    *         analysis
 911    */
 912   private boolean isNoiseWord(String term) {
 913     int len = term.length();
 914     if (minWordLen > 0 && len < minWordLen) {
 915       return true;
 916     }
 917     if (maxWordLen > 0 && len > maxWordLen) {
 918       return true;
 919     }
 920     if (stopWords != null && stopWords.contains(term)) {
 921       return true;
 922     }
 923     return false;
 924   }
 925
 926   /**
 927    * Find words for a more-like-this query former. The result is a priority
 928    * queue of arrays with one entry for <b>every word</b> in the document. Each
 929    * array has 6 elements. The elements are:
 930    * <ol>
 931    * <li>The word (String)
 932    * <li>The top field that this word comes from (String)
 933    * <li>The score for this word (Float)
 934    * <li>The IDF value (Float)
 935    * <li>The frequency of this word in the index (Integer)
 936    * <li>The frequency of this word in the source document (Integer)
 937    * </ol>
 938    * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
 939    * This method is exposed so that you can identify the "interesting words" in a document.
 940    * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 941    *
 942    * @param r the reader that has the content of the document
 943    * @param fieldName field passed to the analyzer to use when analyzing the content
 944    * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
 945    * @see #retrieveInterestingTerms
 946    */
 947   public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
 948     Map<String, Int> words = new HashMap<String, Int>();
 949     addTermFrequencies(r, words, fieldName);
 950     return createQueue(words);
 951   }
 952
 953   /** @deprecated use {@link #retrieveTerms(Reader, String)} instead */
 954   @Deprecated
 955   public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
 956     return retrieveTerms(r, fieldNames[0]);
 957   }
 958
 959   /**
 960    * @see #retrieveInterestingTerms(java.io.Reader, String)
 961    */
 962   public String[] retrieveInterestingTerms(int docNum) throws IOException {
 963     ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
 964     PriorityQueue<Object[]> pq = retrieveTerms(docNum);
 965     Object cur;
 966     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all
 967                              // words but that's probably not useful to our
 968                              // caller...
 969     // we just want to return the top words
 970     while (((cur = pq.pop()) != null) && lim-- > 0) {
 971       Object[] ar = (Object[]) cur;
 972       al.add(ar[0]); // the 1st entry is the interesting word
 973     }
 974     String[] res = new String[al.size()];
 975     return al.toArray(res);
 976   }
 977
 978   /**
 979    * Convenience routine to make it easy to return the most interesting words in a document.
 980    * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 981    *
 982    * @param r the source document
 983    * @param fieldName field passed to analyzer to use when analyzing the content
 984    * @return the most interesting words in the document
 985    * @see #retrieveTerms(java.io.Reader, String)
 986    * @see #setMaxQueryTerms
 987    */
 988   public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
 989     ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
 990     PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
 991     Object cur;
 992     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all
 993                              // words but that's probably not useful to our
 994                              // caller...
 995     // we just want to return the top words
 996     while (((cur = pq.pop()) != null) && lim-- > 0) {
 997       Object[] ar = (Object[]) cur;
 998       al.add(ar[0]); // the 1st entry is the interesting word
 999     }
1000     String[] res = new String[al.size()];
1001     return al.toArray(res);
1002   }
1003
1004   /** @deprecated use {@link #retrieveInterestingTerms(Reader, String)} instead. */
1005   @Deprecated
1006   public String[] retrieveInterestingTerms(Reader r) throws IOException {
1007     return retrieveInterestingTerms(r, fieldNames[0]);
1008   }
1009
1010   /**
1011    * PriorityQueue that orders words by score.
1012    */
1013   private static class FreqQ extends PriorityQueue<Object[]> {
1014     FreqQ(int s) {
1015       initialize(s);
1016     }
1017
1018     @Override
1019     protected boolean lessThan(Object[] aa, Object[] bb) {
1020       Float fa = (Float) aa[2];
1021       Float fb = (Float) bb[2];
1022       return fa.floatValue() > fb.floatValue();
1023     }
1024   }
1025
1026   /**
1027    * Use for frequencies and to avoid renewing Integers.
1028    */
1029   private static class Int {
1030     int x;
1031
1032     Int() {
1033       x = 1;
1034     }
1035   }
1036
1037 }