+++ /dev/null
-package org.apache.lucene.search;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-import java.io.IOException;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.util.FixedBitSet;
-
-/**
- * Filter that, for every distinct term of a single field, lets through at most one
- * of the documents containing that term and treats the remaining ones as duplicates.
- * Which document survives is controlled by the keep mode; how the bit set is built is
- * controlled by the processing mode.
- */
-public class DuplicateFilter extends Filter
-{
-
-  /** Name of the field whose terms identify duplicate documents. */
-  String fieldName;
-
-  /**
-   * KeepMode determines which document id to consider as the master, all others being
-   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
-   */
-  int keepMode=KM_USE_FIRST_OCCURRENCE;
-  /** Keep the first document returned by the term's TermDocs enumeration. */
-  public static final int KM_USE_FIRST_OCCURRENCE=1;
-  /** Keep the last document returned by the term's TermDocs enumeration. */
-  public static final int KM_USE_LAST_OCCURRENCE=2;
-
-  /**
-   * "Full" processing mode starts by setting all bits to false and only setting bits
-   * for documents that contain the given field and are identified as non-duplicates.
-   *
-   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
-   * given field. This approach avoids the need to read TermDocs for terms that are seen
-   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
-   * faster approach, the downside is that bitsets produced will include bits set for
-   * documents that do not actually contain the field given.
-   */
-  int processingMode=PM_FULL_VALIDATION;
-  /** Build the result by setting only the bits of documents known to be non-duplicates. */
-  public static final int PM_FULL_VALIDATION=1;
-  /** Build the result by starting from "all set" and clearing the duplicates found. */
-  public static final int PM_FAST_INVALIDATION=2;
-
-  /**
-   * Creates a filter on the given field that keeps the last occurrence of each term
-   * and uses full validation.
-   *
-   * @param fieldName name of the field whose terms identify duplicates
-   */
-  public DuplicateFilter(String fieldName)
-  {
-    this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
-  }
-
-  /**
-   * Creates a filter with explicit keep and processing modes.
-   *
-   * @param fieldName name of the field whose terms identify duplicates
-   * @param keepMode one of {@link #KM_USE_FIRST_OCCURRENCE} or {@link #KM_USE_LAST_OCCURRENCE}
-   * @param processingMode one of {@link #PM_FULL_VALIDATION} or {@link #PM_FAST_INVALIDATION}
-   */
-  public DuplicateFilter(String fieldName, int keepMode, int processingMode)
-  {
-    this.fieldName = fieldName;
-    this.keepMode = keepMode;
-    this.processingMode = processingMode;
-  }
-
-  /**
-   * Computes the set of documents accepted by this filter for the given reader.
-   * Any processing mode other than {@link #PM_FAST_INVALIDATION} is handled as
-   * full validation.
-   *
-   * @param reader the index reader to enumerate terms and documents from
-   * @return a bit set sized to {@code reader.maxDoc()} with accepted documents set
-   * @throws IOException if reading terms or postings fails
-   */
-  @Override
-  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
-  {
-    if(processingMode==PM_FAST_INVALIDATION)
-    {
-      return fastBits(reader);
-    }
-    else
-    {
-      return correctBits(reader);
-    }
-  }
-
-  /**
-   * Full validation: starts with every bit cleared and sets exactly one bit per term
-   * of the field (the first or the last document of the term's enumeration).
-   */
-  private FixedBitSet correctBits(IndexReader reader) throws IOException
-  {
-    FixedBitSet bits=new FixedBitSet(reader.maxDoc()); //assume all are INvalid
-    Term startTerm=new Term(fieldName);
-    TermEnum te = reader.terms(startTerm);
-    if(te==null)
-    {
-      return bits;
-    }
-    try
-    {
-      Term currTerm=te.term();
-      //identity comparison is intentional: term fieldnames are interned
-      while((currTerm!=null)&&(currTerm.field()==startTerm.field()))
-      {
-        TermDocs td = reader.termDocs(currTerm);
-        try
-        {
-          //set non duplicates
-          if(td.next())
-          {
-            if(keepMode==KM_USE_FIRST_OCCURRENCE)
-            {
-              bits.set(td.doc());
-            }
-            else
-            {
-              //any other keep mode keeps the last document of the enumeration
-              int lastDoc;
-              do
-              {
-                lastDoc=td.doc();
-              }while(td.next());
-              bits.set(lastDoc);
-            }
-          }
-        }
-        finally
-        {
-          td.close(); //release the postings enumerator even if reading failed
-        }
-        if(!te.next())
-        {
-          break;
-        }
-        currTerm=te.term();
-      }
-    }
-    finally
-    {
-      te.close(); //release the term enumerator even if reading failed
-    }
-    return bits;
-  }
-
-  /**
-   * Fast invalidation: starts with every bit set and clears the duplicates of each
-   * term whose reported document frequency is greater than one. Terms reported with
-   * a frequency of one or less are not read at all.
-   */
-  private FixedBitSet fastBits(IndexReader reader) throws IOException
-  {
-    FixedBitSet bits=new FixedBitSet(reader.maxDoc());
-    bits.set(0,reader.maxDoc()); //assume all are valid
-    Term startTerm=new Term(fieldName);
-    TermEnum te = reader.terms(startTerm);
-    if(te==null)
-    {
-      return bits;
-    }
-    try
-    {
-      Term currTerm=te.term();
-      //identity comparison is intentional: term fieldnames are interned
-      while((currTerm!=null)&&(currTerm.field()==startTerm.field()))
-      {
-        if(te.docFreq()>1)
-        {
-          TermDocs td = reader.termDocs(currTerm);
-          try
-          {
-            //docFreq() is treated only as a hint: the enumeration may yield fewer
-            //documents than reported (presumably when documents have been deleted),
-            //so doc() is only read after next() has returned true.
-            if(td.next())
-            {
-              if(keepMode==KM_USE_FIRST_OCCURRENCE)
-              {
-                //the first document is the master; unset every later one
-                while(td.next())
-                {
-                  bits.clear(td.doc());
-                }
-              }
-              else
-              {
-                //unset every document of the term, remembering the last one
-                int lastDoc;
-                do
-                {
-                  lastDoc=td.doc();
-                  bits.clear(lastDoc);
-                }while(td.next());
-                if(keepMode==KM_USE_LAST_OCCURRENCE)
-                {
-                  //restore the last bit
-                  bits.set(lastDoc);
-                }
-              }
-            }
-          }
-          finally
-          {
-            td.close(); //release the postings enumerator even if reading failed
-          }
-        }
-        if(!te.next())
-        {
-          break;
-        }
-        currTerm=te.term();
-      }
-    }
-    finally
-    {
-      te.close(); //release the term enumerator even if reading failed
-    }
-    return bits;
-  }
-
-  /** @return the name of the field whose terms identify duplicates */
-  public String getFieldName()
-  {
-    return fieldName;
-  }
-
-  /** @param fieldName the name of the field whose terms identify duplicates */
-  public void setFieldName(String fieldName)
-  {
-    this.fieldName = fieldName;
-  }
-
-  /** @return the current keep mode */
-  public int getKeepMode()
-  {
-    return keepMode;
-  }
-
-  /** @param keepMode one of {@link #KM_USE_FIRST_OCCURRENCE} or {@link #KM_USE_LAST_OCCURRENCE} */
-  public void setKeepMode(int keepMode)
-  {
-    this.keepMode = keepMode;
-  }
-
-  /**
-   * Two filters are equal when they are of the same class and agree on keep mode,
-   * processing mode and field name (a null field name only equals another null).
-   */
-  @Override
-  public boolean equals(Object obj)
-  {
-    if(this == obj)
-      return true;
-    if((obj == null) || (obj.getClass() != this.getClass()))
-      return false;
-    DuplicateFilter other = (DuplicateFilter)obj;
-    return keepMode == other.keepMode &&
-        processingMode == other.processingMode &&
-        (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
-  }
-
-  /**
-   * Hash code consistent with {@link #equals(Object)}; a null field name contributes
-   * zero instead of causing a NullPointerException.
-   */
-  @Override
-  public int hashCode()
-  {
-    int hash = 217;
-    hash = 31 * hash + keepMode;
-    hash = 31 * hash + processingMode;
-    hash = 31 * hash + (fieldName == null ? 0 : fieldName.hashCode());
-    return hash;
-  }
-
-  /** @return the current processing mode */
-  public int getProcessingMode()
-  {
-    return processingMode;
-  }
-
-  /** @param processingMode one of {@link #PM_FULL_VALIDATION} or {@link #PM_FAST_INVALIDATION} */
-  public void setProcessingMode(int processingMode)
-  {
-    this.processingMode = processingMode;
-  }
-
-}