X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java diff --git a/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java b/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java new file mode 100644 index 0000000..957ab20 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java @@ -0,0 +1,228 @@ +package org.apache.lucene.search; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.FixedBitSet; + +public class DuplicateFilter extends Filter +{ + + String fieldName; + + /** + * KeepMode determines which document id to consider as the master, all others being + * identified as duplicates. Selecting the "first occurrence" can potentially save on IO. + */ + int keepMode=KM_USE_FIRST_OCCURRENCE; + public static final int KM_USE_FIRST_OCCURRENCE=1; + public static final int KM_USE_LAST_OCCURRENCE=2; + + /** + * "Full" processing mode starts by setting all bits to false and only setting bits + * for documents that contain the given field and are identified as none-duplicates. + + * "Fast" processing sets all bits to true then unsets all duplicate docs found for the + * given field. This approach avoids the need to read TermDocs for terms that are seen + * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially + * faster approach , the downside is that bitsets produced will include bits set for + * documents that do not actually contain the field given. + * + */ + int processingMode=PM_FULL_VALIDATION; + public static final int PM_FULL_VALIDATION=1; + public static final int PM_FAST_INVALIDATION=2; + + + + public DuplicateFilter(String fieldName) + { + this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION); + } + + + public DuplicateFilter(String fieldName, int keepMode, int processingMode) + { + this.fieldName = fieldName; + this.keepMode = keepMode; + this.processingMode = processingMode; + } + + @Override + public DocIdSet getDocIdSet(IndexReader reader) throws IOException + { + if(processingMode==PM_FAST_INVALIDATION) + { + return fastBits(reader); + } + else + { + return correctBits(reader); + } + } + + private FixedBitSet correctBits(IndexReader reader) throws IOException + { + + FixedBitSet bits=new FixedBitSet(reader.maxDoc()); //assume all are INvalid + Term startTerm=new Term(fieldName); + TermEnum te = reader.terms(startTerm); + if(te!=null) + { + Term currTerm=te.term(); + while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned + { + int lastDoc=-1; + //set non duplicates + TermDocs td = reader.termDocs(currTerm); + if(td.next()) + { + if(keepMode==KM_USE_FIRST_OCCURRENCE) + { + bits.set(td.doc()); + } + else + { + do + { + lastDoc=td.doc(); + }while(td.next()); + bits.set(lastDoc); + } + } + if(!te.next()) + { + break; + } + currTerm=te.term(); + } + } + return bits; + } + + private FixedBitSet fastBits(IndexReader reader) throws IOException + { + + FixedBitSet bits=new FixedBitSet(reader.maxDoc()); + bits.set(0,reader.maxDoc()); //assume all are valid + Term startTerm=new Term(fieldName); + TermEnum te = reader.terms(startTerm); + if(te!=null) + { + Term currTerm=te.term(); + + while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned + { + if(te.docFreq()>1) + { + int lastDoc=-1; + //unset potential duplicates + TermDocs td = reader.termDocs(currTerm); + td.next(); + if(keepMode==KM_USE_FIRST_OCCURRENCE) + { + td.next(); + } + do + { + lastDoc=td.doc(); + bits.clear(lastDoc); + }while(td.next()); + if(keepMode==KM_USE_LAST_OCCURRENCE) + { + //restore the last bit + bits.set(lastDoc); + } + } + if(!te.next()) + { + break; + } + currTerm=te.term(); + } + } + return bits; + } + + public String getFieldName() + { + return fieldName; + } + + + public void setFieldName(String fieldName) + { + this.fieldName = fieldName; + } + + + public int getKeepMode() + { + return keepMode; + } + + + public void setKeepMode(int keepMode) + { + this.keepMode = keepMode; + } + + + @Override + public boolean equals(Object obj) + { + if(this == obj) + return true; + if((obj == null) || (obj.getClass() != this.getClass())) + return false; + DuplicateFilter other = (DuplicateFilter)obj; + return keepMode == other.keepMode && + processingMode == other.processingMode && + (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName))); + } + + + + @Override + public int hashCode() + { + int hash = 217; + hash = 31 * hash + keepMode; + hash = 31 * hash + processingMode; + hash = 31 * hash + fieldName.hashCode(); + return hash; + } + + + public int getProcessingMode() + { + return processingMode; + } + + + public void setProcessingMode(int processingMode) + { + this.processingMode = processingMode; + } + + + +}