--- /dev/null
+package org.apache.lucene.search;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Filter that keeps a single document for every distinct term of a given field and
+ * treats all other documents sharing that term as duplicates.
+ * <p>
+ * Which document is kept is controlled by the keep mode
+ * ({@link #KM_USE_FIRST_OCCURRENCE} or {@link #KM_USE_LAST_OCCURRENCE}); how the bit set
+ * is built is controlled by the processing mode ({@link #PM_FULL_VALIDATION} or
+ * {@link #PM_FAST_INVALIDATION}).
+ */
+public class DuplicateFilter extends Filter
+{
+  /** Keep the first document returned for each term. */
+  public static final int KM_USE_FIRST_OCCURRENCE=1;
+  /** Keep the last document returned for each term. */
+  public static final int KM_USE_LAST_OCCURRENCE=2;
+
+  /**
+   * "Full" processing mode starts by setting all bits to false and only setting bits
+   * for documents that contain the given field and are identified as non-duplicates.
+   */
+  public static final int PM_FULL_VALIDATION=1;
+  /**
+   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+   * given field. This approach avoids the need to read TermDocs for terms that are seen
+   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
+   * faster approach, the downside is that bitsets produced will include bits set for
+   * documents that do not actually contain the field given.
+   */
+  public static final int PM_FAST_INVALIDATION=2;
+
+  /** Name of the field whose terms are used to detect duplicates. */
+  String fieldName;
+
+  /**
+   * KeepMode determines which document id to consider as the master, all others being
+   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+   * Note that this field initializer is overwritten by both constructors.
+   */
+  int keepMode=KM_USE_FIRST_OCCURRENCE;
+
+  /** Strategy used to build the bit set; see the {@code PM_*} constants. */
+  int processingMode=PM_FULL_VALIDATION;
+
+  /**
+   * Creates a filter on the given field using {@link #KM_USE_LAST_OCCURRENCE} and
+   * {@link #PM_FULL_VALIDATION}.
+   *
+   * @param fieldName name of the field whose terms identify duplicates
+   */
+  public DuplicateFilter(String fieldName)
+  {
+    this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION);
+  }
+
+  /**
+   * Creates a filter on the given field with explicit modes.
+   *
+   * @param fieldName name of the field whose terms identify duplicates
+   * @param keepMode one of the {@code KM_*} constants
+   * @param processingMode one of the {@code PM_*} constants; any value other than
+   *        {@link #PM_FAST_INVALIDATION} selects the full validation path
+   */
+  public DuplicateFilter(String fieldName, int keepMode, int processingMode)
+  {
+    this.fieldName = fieldName;
+    this.keepMode = keepMode;
+    this.processingMode = processingMode;
+  }
+
+  /**
+   * Builds the set of non-duplicate documents for the given reader, dispatching on the
+   * configured processing mode.
+   *
+   * @param reader the index reader to scan
+   * @return a bit set sized to {@code reader.maxDoc()} with accepted documents set
+   * @throws IOException if reading terms or postings fails
+   */
+  @Override
+  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
+  {
+    if(processingMode==PM_FAST_INVALIDATION)
+    {
+      return fastBits(reader);
+    }
+    return correctBits(reader);
+  }
+
+  /**
+   * Full validation: starts with every bit cleared and sets exactly one bit per term of
+   * the field (the first or the last document enumerated for that term).
+   */
+  private FixedBitSet correctBits(IndexReader reader) throws IOException
+  {
+    FixedBitSet bits=new FixedBitSet(reader.maxDoc()); //assume all are INvalid
+    Term startTerm=new Term(fieldName);
+    TermEnum te = reader.terms(startTerm);
+    if(te==null)
+    {
+      return bits;
+    }
+    try
+    {
+      Term currTerm=te.term();
+      //identity comparison relies on term field names being interned
+      while((currTerm!=null)&&(currTerm.field()==startTerm.field()))
+      {
+        TermDocs td = reader.termDocs(currTerm);
+        try
+        {
+          if(td.next())
+          {
+            if(keepMode==KM_USE_FIRST_OCCURRENCE)
+            {
+              bits.set(td.doc());
+            }
+            else
+            {
+              //walk to the end of the postings and keep only the final doc
+              int lastDoc;
+              do
+              {
+                lastDoc=td.doc();
+              }while(td.next());
+              bits.set(lastDoc);
+            }
+          }
+        }
+        finally
+        {
+          td.close(); //release the postings enumeration for this term
+        }
+        if(!te.next())
+        {
+          break;
+        }
+        currTerm=te.term();
+      }
+    }
+    finally
+    {
+      te.close(); //release the term enumeration even if reading failed
+    }
+    return bits;
+  }
+
+  /**
+   * Fast invalidation: starts with every bit set and clears the duplicates of each term
+   * whose document frequency is greater than one. Terms with a document frequency of one
+   * are skipped without reading their postings.
+   */
+  private FixedBitSet fastBits(IndexReader reader) throws IOException
+  {
+    FixedBitSet bits=new FixedBitSet(reader.maxDoc());
+    bits.set(0,reader.maxDoc()); //assume all are valid
+    Term startTerm=new Term(fieldName);
+    TermEnum te = reader.terms(startTerm);
+    if(te==null)
+    {
+      return bits;
+    }
+    try
+    {
+      Term currTerm=te.term();
+      //identity comparison relies on term field names being interned
+      while((currTerm!=null)&&(currTerm.field()==startTerm.field()))
+      {
+        if(te.docFreq()>1)
+        {
+          clearDuplicates(reader.termDocs(currTerm), bits);
+        }
+        if(!te.next())
+        {
+          break;
+        }
+        currTerm=te.term();
+      }
+    }
+    finally
+    {
+      te.close(); //release the term enumeration even if reading failed
+    }
+    return bits;
+  }
+
+  /**
+   * Clears the bits of the duplicate documents enumerated by {@code td}, leaving the
+   * document selected by the keep mode untouched, and closes {@code td}.
+   * <p>
+   * The result of every {@code next()} call is honoured so that {@code doc()} is never
+   * read from an exhausted enumeration, which can happen if the postings yield fewer
+   * documents than the reported document frequency.
+   */
+  private void clearDuplicates(TermDocs td, FixedBitSet bits) throws IOException
+  {
+    try
+    {
+      boolean hasDoc=td.next();
+      if(hasDoc&&(keepMode==KM_USE_FIRST_OCCURRENCE))
+      {
+        //skip the first occurrence: it is the document being kept
+        hasDoc=td.next();
+      }
+      if(!hasDoc)
+      {
+        return; //nothing left to invalidate for this term
+      }
+      int lastDoc;
+      do
+      {
+        lastDoc=td.doc();
+        bits.clear(lastDoc);
+      }while(td.next());
+      if(keepMode==KM_USE_LAST_OCCURRENCE)
+      {
+        //restore the last bit
+        bits.set(lastDoc);
+      }
+    }
+    finally
+    {
+      td.close();
+    }
+  }
+
+  /** @return the name of the field whose terms identify duplicates */
+  public String getFieldName()
+  {
+    return fieldName;
+  }
+
+  /** @param fieldName the name of the field whose terms identify duplicates */
+  public void setFieldName(String fieldName)
+  {
+    this.fieldName = fieldName;
+  }
+
+  /** @return the current keep mode (one of the {@code KM_*} constants) */
+  public int getKeepMode()
+  {
+    return keepMode;
+  }
+
+  /** @param keepMode the keep mode to use (one of the {@code KM_*} constants) */
+  public void setKeepMode(int keepMode)
+  {
+    this.keepMode = keepMode;
+  }
+
+  /**
+   * Two filters are equal when they are of the same class and share the same keep mode,
+   * processing mode and field name (a null field name only equals another null).
+   */
+  @Override
+  public boolean equals(Object obj)
+  {
+    if(this == obj)
+    {
+      return true;
+    }
+    if((obj == null) || (obj.getClass() != this.getClass()))
+    {
+      return false;
+    }
+    DuplicateFilter other = (DuplicateFilter)obj;
+    return keepMode == other.keepMode &&
+        processingMode == other.processingMode &&
+        (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
+  }
+
+  /**
+   * Hash code consistent with {@link #equals(Object)}; a null field name contributes 0
+   * instead of causing a NullPointerException.
+   */
+  @Override
+  public int hashCode()
+  {
+    int hash = 217;
+    hash = 31 * hash + keepMode;
+    hash = 31 * hash + processingMode;
+    hash = 31 * hash + (fieldName == null ? 0 : fieldName.hashCode());
+    return hash;
+  }
+
+  /** @return the current processing mode (one of the {@code PM_*} constants) */
+  public int getProcessingMode()
+  {
+    return processingMode;
+  }
+
+  /** @param processingMode the processing mode to use (one of the {@code PM_*} constants) */
+  public void setProcessingMode(int processingMode)
+  {
+    this.processingMode = processingMode;
+  }
+}