X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java?ds=sidebyside diff --git a/lucene-java-3.4.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java b/lucene-java-3.4.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java deleted file mode 100644 index 957ab20..0000000 --- a/lucene-java-3.4.0/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java +++ /dev/null @@ -1,228 +0,0 @@ -package org.apache.lucene.search; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import java.io.IOException; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.util.FixedBitSet; - -public class DuplicateFilter extends Filter -{ - - String fieldName; - - /** - * KeepMode determines which document id to consider as the master, all others being - * identified as duplicates. Selecting the "first occurrence" can potentially save on IO. - */ - int keepMode=KM_USE_FIRST_OCCURRENCE; - public static final int KM_USE_FIRST_OCCURRENCE=1; - public static final int KM_USE_LAST_OCCURRENCE=2; - - /** - * "Full" processing mode starts by setting all bits to false and only setting bits - * for documents that contain the given field and are identified as none-duplicates. - - * "Fast" processing sets all bits to true then unsets all duplicate docs found for the - * given field. This approach avoids the need to read TermDocs for terms that are seen - * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially - * faster approach , the downside is that bitsets produced will include bits set for - * documents that do not actually contain the field given. - * - */ - int processingMode=PM_FULL_VALIDATION; - public static final int PM_FULL_VALIDATION=1; - public static final int PM_FAST_INVALIDATION=2; - - - - public DuplicateFilter(String fieldName) - { - this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION); - } - - - public DuplicateFilter(String fieldName, int keepMode, int processingMode) - { - this.fieldName = fieldName; - this.keepMode = keepMode; - this.processingMode = processingMode; - } - - @Override - public DocIdSet getDocIdSet(IndexReader reader) throws IOException - { - if(processingMode==PM_FAST_INVALIDATION) - { - return fastBits(reader); - } - else - { - return correctBits(reader); - } - } - - private FixedBitSet correctBits(IndexReader reader) throws IOException - { - - FixedBitSet bits=new FixedBitSet(reader.maxDoc()); //assume all are INvalid - Term startTerm=new Term(fieldName); - TermEnum te = reader.terms(startTerm); - if(te!=null) - { - Term currTerm=te.term(); - while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned - { - int lastDoc=-1; - //set non duplicates - TermDocs td = reader.termDocs(currTerm); - if(td.next()) - { - if(keepMode==KM_USE_FIRST_OCCURRENCE) - { - bits.set(td.doc()); - } - else - { - do - { - lastDoc=td.doc(); - }while(td.next()); - bits.set(lastDoc); - } - } - if(!te.next()) - { - break; - } - currTerm=te.term(); - } - } - return bits; - } - - private FixedBitSet fastBits(IndexReader reader) throws IOException - { - - FixedBitSet bits=new FixedBitSet(reader.maxDoc()); - bits.set(0,reader.maxDoc()); //assume all are valid - Term startTerm=new Term(fieldName); - TermEnum te = reader.terms(startTerm); - if(te!=null) - { - Term currTerm=te.term(); - - while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned - { - if(te.docFreq()>1) - { - int lastDoc=-1; - //unset potential duplicates - TermDocs td = reader.termDocs(currTerm); - td.next(); - if(keepMode==KM_USE_FIRST_OCCURRENCE) - { - td.next(); - } - do - { - lastDoc=td.doc(); - bits.clear(lastDoc); - }while(td.next()); - if(keepMode==KM_USE_LAST_OCCURRENCE) - { - //restore the last bit - bits.set(lastDoc); - } - } - if(!te.next()) - { - break; - } - currTerm=te.term(); - } - } - return bits; - } - - public String getFieldName() - { - return fieldName; - } - - - public void setFieldName(String fieldName) - { - this.fieldName = fieldName; - } - - - public int getKeepMode() - { - return keepMode; - } - - - public void setKeepMode(int keepMode) - { - this.keepMode = keepMode; - } - - - @Override - public boolean equals(Object obj) - { - if(this == obj) - return true; - if((obj == null) || (obj.getClass() != this.getClass())) - return false; - DuplicateFilter other = (DuplicateFilter)obj; - return keepMode == other.keepMode && - processingMode == other.processingMode && - (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName))); - } - - - - @Override - public int hashCode() - { - int hash = 217; - hash = 31 * hash + keepMode; - hash = 31 * hash + processingMode; - hash = 31 * hash + fieldName.hashCode(); - return hash; - } - - - public int getProcessingMode() - { - return processingMode; - } - - - public void setProcessingMode(int processingMode) - { - this.processingMode = processingMode; - } - - - -}