add --shared
[pylucene.git] / lucene-java-3.4.0 / lucene / contrib / queries / src / java / org / apache / lucene / search / DuplicateFilter.java
1 package org.apache.lucene.search;
2 /**
3  * Licensed to the Apache Software Foundation (ASF) under one or more
4  * contributor license agreements.  See the NOTICE file distributed with
5  * this work for additional information regarding copyright ownership.
6  * The ASF licenses this file to You under the Apache License, Version 2.0
7  * (the "License"); you may not use this file except in compliance with
8  * the License.  You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 import java.io.IOException;
19
20 import org.apache.lucene.index.IndexReader;
21 import org.apache.lucene.index.Term;
22 import org.apache.lucene.index.TermDocs;
23 import org.apache.lucene.index.TermEnum;
24 import org.apache.lucene.util.FixedBitSet;
25
26 public class DuplicateFilter extends Filter
27 {
28         
29         String fieldName;
30         
31         /**
32          * KeepMode determines which document id to consider as the master, all others being 
33          * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
34          */
35         int keepMode=KM_USE_FIRST_OCCURRENCE;
36         public static final int KM_USE_FIRST_OCCURRENCE=1;
37         public static final int KM_USE_LAST_OCCURRENCE=2;
38         
39         /**
40          * "Full" processing mode starts by setting all bits to false and only setting bits
41          * for documents that contain the given field and are identified as none-duplicates. 
42
43          * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
44          * given field. This approach avoids the need to read TermDocs for terms that are seen 
45          * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially 
46          * faster approach , the downside is that bitsets produced will include bits set for 
47          * documents that do not actually contain the field given.
48          * 
49          */
50         int processingMode=PM_FULL_VALIDATION;
51         public static final int PM_FULL_VALIDATION=1;
52         public static final int PM_FAST_INVALIDATION=2;
53         
54
55         
56         public DuplicateFilter(String fieldName)
57         {
58                 this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
59         }
60         
61
62         public DuplicateFilter(String fieldName, int keepMode, int processingMode)
63         {
64                 this.fieldName = fieldName;
65                 this.keepMode = keepMode;
66                 this.processingMode = processingMode;
67         }
68
69   @Override
70   public DocIdSet getDocIdSet(IndexReader reader) throws IOException
71         {
72                 if(processingMode==PM_FAST_INVALIDATION)
73                 {
74                         return fastBits(reader);
75                 }
76                 else
77                 {
78                         return correctBits(reader);
79                 }
80         }
81         
82   private FixedBitSet correctBits(IndexReader reader) throws IOException
83         {
84                 
85     FixedBitSet bits=new FixedBitSet(reader.maxDoc()); //assume all are INvalid
86                 Term startTerm=new Term(fieldName);
87                 TermEnum te = reader.terms(startTerm);
88                 if(te!=null)
89                 {
90                         Term currTerm=te.term();
91                         while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
92                         {
93                                 int lastDoc=-1;
94                                 //set non duplicates
95                                 TermDocs td = reader.termDocs(currTerm);
96                                 if(td.next())
97                                 {
98                                         if(keepMode==KM_USE_FIRST_OCCURRENCE)
99                                         {
100                                                 bits.set(td.doc());
101                                         }
102                                         else
103                                         {
104                                                 do
105                                                 {
106                                                         lastDoc=td.doc();
107                                                 }while(td.next());
108                                                 bits.set(lastDoc);
109                                         }
110                                 }
111                                 if(!te.next())
112                                 {
113                                         break;
114                                 }
115                                 currTerm=te.term();
116                         }
117                 }
118                 return bits;
119         }
120         
121   private FixedBitSet fastBits(IndexReader reader) throws IOException
122         {
123                 
124     FixedBitSet bits=new FixedBitSet(reader.maxDoc());
125                 bits.set(0,reader.maxDoc()); //assume all are valid
126                 Term startTerm=new Term(fieldName);
127                 TermEnum te = reader.terms(startTerm);
128                 if(te!=null)
129                 {
130                         Term currTerm=te.term();
131                         
132                         while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
133                         {
134                                 if(te.docFreq()>1)
135                                 {
136                                         int lastDoc=-1;
137                                         //unset potential duplicates
138                                         TermDocs td = reader.termDocs(currTerm);
139                                         td.next();
140                                         if(keepMode==KM_USE_FIRST_OCCURRENCE)
141                                         {
142                                                 td.next();
143                                         }
144                                         do
145                                         {
146                                                 lastDoc=td.doc();
147             bits.clear(lastDoc);
148                                         }while(td.next());
149                                         if(keepMode==KM_USE_LAST_OCCURRENCE)
150                                         {
151                                                 //restore the last bit
152                                                 bits.set(lastDoc);
153                                         }                                       
154                                 }
155                                 if(!te.next())
156                                 {
157                                         break;
158                                 }
159                                 currTerm=te.term();
160                         }
161                 }
162                 return bits;
163         }
164
165         public String getFieldName()
166         {
167                 return fieldName;
168         }
169
170
171         public void setFieldName(String fieldName)
172         {
173                 this.fieldName = fieldName;
174         }
175
176
177         public int getKeepMode()
178         {
179                 return keepMode;
180         }
181
182
183         public void setKeepMode(int keepMode)
184         {
185                 this.keepMode = keepMode;
186         }
187
188
189         @Override
190         public boolean equals(Object obj)
191         {
192                 if(this == obj)
193                         return true;
194                 if((obj == null) || (obj.getClass() != this.getClass()))
195                         return false;
196                 DuplicateFilter other = (DuplicateFilter)obj;
197                 return keepMode == other.keepMode &&
198                 processingMode == other.processingMode &&
199                         (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
200         }
201
202
203
204         @Override
205         public int hashCode()
206         {
207                 int hash = 217;
208                 hash = 31 * hash + keepMode;
209                 hash = 31 * hash + processingMode;
210                 hash = 31 * hash + fieldName.hashCode();
211                 return hash;    
212         }
213
214
215         public int getProcessingMode()
216         {
217                 return processingMode;
218         }
219
220
221         public void setProcessingMode(int processingMode)
222         {
223                 this.processingMode = processingMode;
224         }
225         
226         
227
228 }