2 * Created on 25-Jan-2006
4 package org.apache.lucene.xmlparser.builders;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
21 * Licensed to the Apache Software Foundation (ASF) under one or more
22 * contributor license agreements. See the NOTICE file distributed with
23 * this work for additional information regarding copyright ownership.
24 * The ASF licenses this file to You under the Apache License, Version 2.0
25 * (the "License"); you may not use this file except in compliance with
26 * the License. You may obtain a copy of the License at
28 * http://www.apache.org/licenses/LICENSE-2.0
30 * Unless required by applicable law or agreed to in writing, software
31 * distributed under the License is distributed on an "AS IS" BASIS,
32 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
33 * See the License for the specific language governing permissions and
34 * limitations under the License.
40 public class LikeThisQueryBuilder implements QueryBuilder {
42 private Analyzer analyzer;
43 String defaultFieldNames [];
44 int defaultMaxQueryTerms=20;
45 int defaultMinTermFrequency=1;
46 float defaultPercentTermsToMatch=30; //default is a 3rd of selected terms must match
48 public LikeThisQueryBuilder(Analyzer analyzer,String [] defaultFieldNames)
50 this.analyzer=analyzer;
51 this.defaultFieldNames=defaultFieldNames;
55 * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
57 public Query getQuery(Element e) throws ParserException {
58 String fieldsList=e.getAttribute("fieldNames"); //a comma-delimited list of fields
59 String fields[]=defaultFieldNames;
60 if((fieldsList!=null)&&(fieldsList.trim().length()>0))
62 fields=fieldsList.trim().split(",");
64 for (int i = 0; i < fields.length; i++) {
65 fields[i]=fields[i].trim();
69 //Parse any "stopWords" attribute
70 //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
71 //I use all analyzers/fields to generate multi-field compatible stop list
72 String stopWords=e.getAttribute("stopWords");
73 Set<String> stopWordsSet=null;
74 if((stopWords!=null)&&(fields!=null))
76 stopWordsSet=new HashSet<String>();
77 for (int i = 0; i < fields.length; i++)
81 TokenStream ts = analyzer.reusableTokenStream(fields[i],new StringReader(stopWords));
82 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
84 while(ts.incrementToken()) {
85 stopWordsSet.add(termAtt.toString());
90 catch(IOException ioe)
92 throw new ParserException("IoException parsing stop words list in "
93 +getClass().getName()+":"+ioe.getLocalizedMessage());
99 MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer, fields[0]);
100 mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms));
101 mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency));
102 mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);
103 mlt.setStopWords(stopWordsSet);
104 int minDocFreq=DOMUtils.getAttribute(e,"minDocFreq",-1);
107 mlt.setMinDocFreq(minDocFreq);
110 mlt.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));