1 package org.apache.lucene.search;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.util.HashSet;
23 import org.apache.lucene.analysis.MockAnalyzer;
24 import org.apache.lucene.document.Document;
25 import org.apache.lucene.document.Field;
26 import org.apache.lucene.index.IndexReader;
27 import org.apache.lucene.index.RandomIndexWriter;
28 import org.apache.lucene.index.Term;
29 import org.apache.lucene.index.TermDocs;
30 import org.apache.lucene.store.Directory;
31 import org.apache.lucene.util.LuceneTestCase;
33 public class DuplicateFilterTest extends LuceneTestCase {
// Field whose value identifies duplicate documents in the test index.
private static final String KEY_FIELD = "url";
// Shared index fixture; created in setUp() and torn down in tearDown().
private Directory directory;
private IndexReader reader;
// Query used by every test: matches all docs containing "lucene" in "text".
TermQuery tq=new TermQuery(new Term("text","lucene"));
private IndexSearcher searcher;
41 public void setUp() throws Exception {
43 directory = newDirectory();
44 RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
46 //Add series of docs with filterable fields : url, text and dates flags
47 addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
48 addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
49 addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
50 addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
51 addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
52 addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
53 addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
54 addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
56 // Until we fix LUCENE-2348, the index must
57 // have only 1 segment:
60 reader = writer.getReader();
62 searcher =newSearcher(reader);
67 public void tearDown() throws Exception {
74 private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
76 Document doc=new Document();
77 doc.add(newField(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
78 doc.add(newField("text",text,Field.Store.YES,Field.Index.ANALYZED));
79 doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
80 writer.addDocument(doc);
83 public void testDefaultFilter() throws Throwable
85 DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
86 HashSet<String> results=new HashSet<String>();
87 ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
88 for(int i=0;i<hits.length;i++)
90 Document d=searcher.doc(hits[i].doc);
91 String url=d.get(KEY_FIELD);
92 assertFalse("No duplicate urls should be returned",results.contains(url));
96 public void testNoFilter() throws Throwable
98 HashSet<String> results=new HashSet<String>();
99 ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
100 assertTrue("Default searching should have found some matches",hits.length>0);
101 boolean dupsFound=false;
102 for(int i=0;i<hits.length;i++)
104 Document d=searcher.doc(hits[i].doc);
105 String url=d.get(KEY_FIELD);
107 dupsFound=results.contains(url);
110 assertTrue("Default searching should have found duplicate urls",dupsFound);
113 public void testFastFilter() throws Throwable
115 DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
116 df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
117 HashSet<String> results=new HashSet<String>();
118 ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
119 assertTrue("Filtered searching should have found some matches",hits.length>0);
120 for(int i=0;i<hits.length;i++)
122 Document d=searcher.doc(hits[i].doc);
123 String url=d.get(KEY_FIELD);
124 assertFalse("No duplicate urls should be returned",results.contains(url));
127 assertEquals("Two urls found",2, results.size());
129 public void testKeepsLastFilter() throws Throwable
131 DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
132 df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
133 ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
134 assertTrue("Filtered searching should have found some matches",hits.length>0);
135 for(int i=0;i<hits.length;i++)
137 Document d=searcher.doc(hits[i].doc);
138 String url=d.get(KEY_FIELD);
139 TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
145 assertEquals("Duplicate urls should return last doc",lastDoc, hits[i].doc);
150 public void testKeepsFirstFilter() throws Throwable
152 DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
153 df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
154 ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
155 assertTrue("Filtered searching should have found some matches",hits.length>0);
156 for(int i=0;i<hits.length;i++)
158 Document d=searcher.doc(hits[i].doc);
159 String url=d.get(KEY_FIELD);
160 TermDocs td = reader.termDocs(new Term(KEY_FIELD,url));
164 assertEquals("Duplicate urls should return first doc",lastDoc, hits[i].doc);