package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

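/**
 * Tests for {@link DuplicateFilter}: verifies that documents sharing the same
 * key field value are collapsed to a single hit, honouring the configured
 * keep mode (first vs. last occurrence) and processing mode.
 */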
public class DuplicateFilterTest extends LuceneTestCase {
  private static final String KEY_FIELD = "url";
  private Directory directory;
  private IndexReader reader;
  TermQuery tq = new TermQuery(new Term("text", "lucene"));
  private IndexSearcher searcher;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    // Add a series of docs with filterable fields: url, text and date
    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

    // Until we fix LUCENE-2348, the index must
    // have only 1 segment:
    writer.optimize();

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    searcher.close();
    directory.close();
    super.tearDown();
  }

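  /** Indexes one document carrying the three fields the tests filter on. */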
  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
    Document doc = new Document();
    doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

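  /**
   * With default settings, each distinct url should appear at most once in
   * the results.
   */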
  public void testDefaultFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    for (int i = 0; i < hits.length; i++) {
      Document d = searcher.doc(hits[i].doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
  }

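  /**
   * Sanity check: without the filter, the same query must return several
   * docs sharing a url, i.e. duplicates really are present in the index.
   */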
  public void testNoFilter() throws Throwable {
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
    assertTrue("Default searching should have found some matches", hits.length > 0);
    boolean dupsFound = false;
    for (int i = 0; i < hits.length; i++) {
      Document d = searcher.doc(hits[i].doc);
      String url = d.get(KEY_FIELD);
      if (!dupsFound) {
        dupsFound = results.contains(url);
      }
      results.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
  }

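  /**
   * PM_FAST_INVALIDATION mode must still yield exactly one hit per url; the
   * index contains two distinct urls, so two results are expected.
   */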
  public void testFastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);
    for (int i = 0; i < hits.length; i++) {
      Document d = searcher.doc(hits[i].doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
    assertEquals("Two urls found", 2, results.size());
  }

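  /**
   * With KM_USE_LAST_OCCURRENCE, the surviving hit for each url must be the
   * last document in the index that contains it.
   */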
  public void testKeepsLastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);
    for (int i = 0; i < hits.length; i++) {
      Document d = searcher.doc(hits[i].doc);
      String url = d.get(KEY_FIELD);
      TermDocs td = reader.termDocs(new Term(KEY_FIELD, url));
      int lastDoc = 0;
      while (td.next()) {
        lastDoc = td.doc();
      }
      assertEquals("Duplicate urls should return last doc", lastDoc, hits[i].doc);
    }
  }

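  /**
   * With KM_USE_FIRST_OCCURRENCE, the surviving hit for each url must be the
   * first document in the index that contains it.
   */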
  public void testKeepsFirstFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);
    for (int i = 0; i < hits.length; i++) {
      Document d = searcher.doc(hits[i].doc);
      String url = d.get(KEY_FIELD);
      TermDocs td = reader.termDocs(new Term(KEY_FIELD, url));
      td.next();
      int firstDoc = td.doc();
      assertEquals("Duplicate urls should return first doc", firstDoc, hits[i].doc);
    }
  }
}