1 package org.apache.lucene.search;
3 * Licensed to the Apache Software Foundation (ASF) under one or more
4 * contributor license agreements. See the NOTICE file distributed with
5 * this work for additional information regarding copyright ownership.
6 * The ASF licenses this file to You under the Apache License, Version 2.0
7 * (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 import java.io.IOException;
20 import org.apache.lucene.index.IndexReader;
21 import org.apache.lucene.index.Term;
22 import org.apache.lucene.index.TermDocs;
23 import org.apache.lucene.index.TermEnum;
24 import org.apache.lucene.util.FixedBitSet;
// Filter that de-duplicates documents on a single field: among the documents sharing a term
// of that field, one is treated as the master and the others as duplicates. keepMode selects
// which occurrence is the master; processingMode selects how the bit set is built.
26 public class DuplicateFilter extends Filter
32 * KeepMode determines which document id to consider as the master, all others being
33 * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
35 int keepMode=KM_USE_FIRST_OCCURRENCE;
// NOTE(review): the initializer above names KM_USE_FIRST_OCCURRENCE, but the single-argument
// constructor passes KM_USE_LAST_OCCURRENCE and the three-argument constructor always assigns
// keepMode, so the effective default for callers is "last occurrence" - confirm this is intended.
// Accepted keepMode values:
36 public static final int KM_USE_FIRST_OCCURRENCE=1;
37 public static final int KM_USE_LAST_OCCURRENCE=2;
40 * "Full" processing mode starts by setting all bits to false and only setting bits
41 * for documents that contain the given field and are identified as non-duplicates.
43 * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
44 * given field. This approach avoids the need to read TermDocs for terms that are seen
45 * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
46 * faster approach, the downside is that bitsets produced will include bits set for
47 * documents that do not actually contain the field given.
50 int processingMode=PM_FULL_VALIDATION;
// Accepted processingMode values. getDocIdSet takes the fast path only when the mode equals
// PM_FAST_INVALIDATION; its other return statement calls correctBits(reader).
51 public static final int PM_FULL_VALIDATION=1;
52 public static final int PM_FAST_INVALIDATION=2;
// Creates a filter on fieldName using keep mode KM_USE_LAST_OCCURRENCE and processing mode
// PM_FULL_VALIDATION, by delegating to the three-argument constructor.
56 public DuplicateFilter(String fieldName)
58 this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
// Creates a filter with explicit settings.
//   fieldName      - name of the field whose terms are examined for duplicates
//   keepMode       - one of the KM_* constants
//   processingMode - one of the PM_* constants
// The three values are stored as given; the assignments below perform no validation.
62 public DuplicateFilter(String fieldName, int keepMode, int processingMode)
64 this.fieldName = fieldName;
65 this.keepMode = keepMode;
66 this.processingMode = processingMode;
// Computes the document set for the given reader by dispatching on processingMode.
// Returns the bit set built by fastBits(reader) or by correctBits(reader).
// Declares IOException because both helpers declare it.
70 public DocIdSet getDocIdSet(IndexReader reader) throws IOException
72 if(processingMode==PM_FAST_INVALIDATION)
// fast path: all bits start set and duplicates are cleared
74 return fastBits(reader);
// every other mode value ends up here, not only PM_FULL_VALIDATION
78 return correctBits(reader);
// Builds the bit set for the "full validation" mode: the set is sized to reader.maxDoc() and
// starts with every bit clear, and the terms of fieldName are then walked one by one.
82 private FixedBitSet correctBits(IndexReader reader) throws IOException
85 FixedBitSet bits=new FixedBitSet(reader.maxDoc()); //assume all are INvalid
// A Term built from the field name alone is used as the starting point of the enumeration.
86 Term startTerm=new Term(fieldName);
87 TermEnum te = reader.terms(startTerm);
90 Term currTerm=te.term();
// The == on field names is deliberate (field names are interned, per the trailing comment);
// the loop ends when the enumeration is exhausted or moves on to another field.
91 while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
// documents containing the current term
95 TermDocs td = reader.termDocs(currTerm);
// keepMode decides which of those documents is kept
98 if(keepMode==KM_USE_FIRST_OCCURRENCE)
// Builds the bit set for the "fast invalidation" mode: the set is sized to reader.maxDoc(),
// every bit in [0, maxDoc) is set up front, and the terms of fieldName are then walked.
// Consequence (see the processing-mode comment in this class): documents that do not contain
// the field at all keep their bit set.
121 private FixedBitSet fastBits(IndexReader reader) throws IOException
124 FixedBitSet bits=new FixedBitSet(reader.maxDoc());
125 bits.set(0,reader.maxDoc()); //assume all are valid
// A Term built from the field name alone is used as the starting point of the enumeration.
126 Term startTerm=new Term(fieldName);
127 TermEnum te = reader.terms(startTerm);
130 Term currTerm=te.term();
// The == on field names is deliberate (field names are interned, per the trailing comment);
// the loop ends when the enumeration is exhausted or moves on to another field.
132 while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
137 //unset potential duplicates
138 TermDocs td = reader.termDocs(currTerm);
// branch for keeping the first occurrence
140 if(keepMode==KM_USE_FIRST_OCCURRENCE)
// branch for keeping the last occurrence
149 if(keepMode==KM_USE_LAST_OCCURRENCE)
151 //restore the last bit
// Getter counterpart of setFieldName.
// NOTE(review): presumably returns the fieldName field - confirm.
165 public String getFieldName()
// Replaces the field whose terms are examined for duplicates.
// NOTE(review): fieldName takes part in equals() and hashCode(), so changing it while this
// filter is held as a key in a hash-based collection would change its hash.
171 public void setFieldName(String fieldName)
173 this.fieldName = fieldName;
// Getter counterpart of setKeepMode.
// NOTE(review): presumably returns the keepMode field (one of the KM_* constants) - confirm.
177 public int getKeepMode()
// Sets which occurrence is kept; expected to be one of the KM_* constants, though the value
// is stored without any check.
// NOTE(review): keepMode takes part in equals() and hashCode(), so changing it while this
// filter is held as a key in a hash-based collection would change its hash.
183 public void setKeepMode(int keepMode)
185 this.keepMode = keepMode;
// Value equality: two filters match when they have the same runtime class and the same
// keepMode, processingMode and fieldName.
190 public boolean equals(Object obj)
// Null and any object of a different runtime class are handled before the cast below, so a
// subclass instance is not treated the same as a DuplicateFilter.
194 if((obj == null) || (obj.getClass() != this.getClass()))
196 DuplicateFilter other = (DuplicateFilter)obj;
// The fieldName comparison is null-safe: identical references (including both null) match,
// otherwise String.equals decides.
197 return keepMode == other.keepMode &&
198 processingMode == other.processingMode &&
199 (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
// Hash combining the same three fields that equals() compares, each step multiplying the
// running value by 31 before adding the next component.
205 public int hashCode()
208 hash = 31 * hash + keepMode;
209 hash = 31 * hash + processingMode;
// NOTE(review): fieldName.hashCode() throws NullPointerException when fieldName is null,
// whereas equals() accepts a null fieldName; neither the constructors nor setFieldName reject
// null. Consider a null-safe form such as (fieldName == null ? 0 : fieldName.hashCode()).
210 hash = 31 * hash + fieldName.hashCode();
// Returns the current processing mode, i.e. the value last stored in processingMode by a
// constructor or by setProcessingMode.
215 public int getProcessingMode()
217 return processingMode;
// Sets how the bit set is built; expected to be one of the PM_* constants, though the value
// is stored without any check. getDocIdSet uses fastBits only for PM_FAST_INVALIDATION and
// correctBits for any other value.
// NOTE(review): processingMode takes part in equals() and hashCode(), so changing it while
// this filter is held as a key in a hash-based collection would change its hash.
221 public void setProcessingMode(int processingMode)
223 this.processingMode = processingMode;