Configured build for Ubuntu and added Stempel Polish analyzer
[pylucene.git] / test / test_PhraseQuery.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 from unittest import TestCase, main
16 from lucene import *
17
18
class PhraseQueryTestCase(TestCase):
    """
    Unit tests ported from Java Lucene

    Most tests run against the one-document index built in setUp(), whose
    "field" holds the whitespace-separated tokens
    "one two three four five".  Each of them fills in a PhraseQuery over
    that field and checks how many documents match for a given slop.
    The last two tests build their own private indexes instead.
    """

    def setUp(self):
        """
        Build the shared single-document index, open a searcher on it and
        create the empty PhraseQuery that the individual tests fill in.
        """

        self.directory = RAMDirectory()
        # third argument (True) asks IndexWriter to create a new index
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)

        doc = Document()
        doc.add(Field("field", "one two three four five",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.optimize()
        writer.close()

        # second argument (True) opens the searcher read-only
        self.searcher = IndexSearcher(self.directory, True)
        self.query = PhraseQuery()

    def tearDown(self):
        """
        Release the searcher and the directory created by setUp().
        """

        self.searcher.close()
        self.directory.close()

    def _addTerms(self, query, field, *words):
        """
        Append one Term(field, word) to query for every word, in the
        order given.
        """

        for word in words:
            query.add(Term(field, word))

    def _assertHitCount(self, expected, query, msg=None, searcher=None):
        """
        Run query (asking for at most 50 results) and assert that the
        total number of hits equals expected.

        msg is the optional assertion message; searcher defaults to the
        shared self.searcher opened by setUp().
        """

        if searcher is None:
            searcher = self.searcher
        topDocs = searcher.search(query, 50)
        self.assertEqual(expected, topDocs.totalHits, msg)

    def testNotCloseEnough(self):
        """
        "one" ... "five" does not match with a slop of only 2
        """

        self.query.setSlop(2)
        self._addTerms(self.query, "field", "one", "five")
        self._assertHitCount(0, self.query)

    def testBarelyCloseEnough(self):
        """
        "one" ... "five" matches once the slop is raised to 3
        """

        self.query.setSlop(3)
        self._addTerms(self.query, "field", "one", "five")
        self._assertHitCount(1, self.query)

    def testExact(self):
        """
        Ensures slop of 0 works for exact matches, but not reversed
        """

        # slop is zero by default
        self._addTerms(self.query, "field", "four", "five")
        self._assertHitCount(1, self.query, "exact match")

        self.query = PhraseQuery()
        self._addTerms(self.query, "field", "two", "one")
        self._assertHitCount(0, self.query, "reverse not exact")

    def testSlop1(self):
        """
        Slop of 1 matches adjacent terms in order but not reversed
        """

        # Ensures slop of 1 works with terms in order.
        self.query.setSlop(1)
        self._addTerms(self.query, "field", "one", "two")
        self._assertHitCount(1, self.query, "in order")

        # Ensures slop of 1 does not work for phrases out of order
        # must be at least 2.
        self.query = PhraseQuery()
        self.query.setSlop(1)
        self._addTerms(self.query, "field", "two", "one")
        self._assertHitCount(0, self.query, "reversed, slop not 2 or more")

    def testOrderDoesntMatter(self):
        """
        As long as slop is at least 2, terms can be reversed
        """

        self.query.setSlop(2) # must be at least two for reverse order match
        self._addTerms(self.query, "field", "two", "one")
        self._assertHitCount(1, self.query, "just sloppy enough")

        self.query = PhraseQuery()
        self.query.setSlop(2)
        self._addTerms(self.query, "field", "three", "one")
        self._assertHitCount(0, self.query, "not sloppy enough")

    def testMulipleTerms(self):
        """
        slop is the total number of positional moves allowed
        to line up a phrase
        """

        self.query.setSlop(2)
        self._addTerms(self.query, "field", "one", "three", "five")
        self._assertHitCount(1, self.query, "two total moves")

        self.query = PhraseQuery()
        self.query.setSlop(5) # it takes six moves to match this phrase
        self._addTerms(self.query, "field", "five", "three", "one")
        self._assertHitCount(0, self.query, "slop of 5 not close enough")

        # same query object, only the slop is relaxed
        self.query.setSlop(6)
        self._assertHitCount(1, self.query, "slop of 6 just right")

    def testPhraseQueryWithStopAnalyzer(self):
        """
        Exact phrases still match in text indexed through a StopAnalyzer
        """

        directory = RAMDirectory()
        # try/finally so the directory and searcher are released even
        # when one of the assertions below fails
        try:
            stopAnalyzer = StopAnalyzer(Version.LUCENE_24)
            writer = IndexWriter(directory, stopAnalyzer, True,
                                 IndexWriter.MaxFieldLength.LIMITED)
            doc = Document()
            doc.add(Field("field", "the stop words are here",
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
            writer.close()

            searcher = IndexSearcher(directory, True)
            try:
                # valid exact phrase query
                query = PhraseQuery()
                self._addTerms(query, "field", "stop", "words")
                self._assertHitCount(1, query, searcher=searcher)

                # currently StopAnalyzer does not leave "holes", so this
                # matches.
                # NOTE(review): presumably this depends on the
                # Version.LUCENE_24 compatibility setting - confirm
                query = PhraseQuery()
                self._addTerms(query, "field", "words", "here")
                self._assertHitCount(1, query, searcher=searcher)
            finally:
                searcher.close()
        finally:
            directory.close()

    def testPhraseQueryInConjunctionScorer(self):
        """
        A PhraseQuery gives the right hits on its own and when it is a
        MUST clause of a BooleanQuery next to a TermQuery, whichever of
        the two clauses is added first
        """

        directory = RAMDirectory()
        # try/finally so the directory and searchers are released even
        # when one of the assertions below fails
        try:
            # -- first index: two docs sharing the same "source" text,
            #    only the second one has a "contents" field
            writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                                 IndexWriter.MaxFieldLength.LIMITED)

            doc = Document()
            doc.add(Field("source", "marketing info",
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("contents", "foobar",
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            doc.add(Field("source", "marketing info",
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)

            writer.optimize()
            writer.close()

            searcher = IndexSearcher(directory, True)
            try:
                phraseQuery = PhraseQuery()
                self._addTerms(phraseQuery, "source", "marketing", "info")
                self._assertHitCount(2, phraseQuery, searcher=searcher)

                termQuery = TermQuery(Term("contents","foobar"))
                booleanQuery = BooleanQuery()
                booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
                booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
                self._assertHitCount(1, booleanQuery, searcher=searcher)
            finally:
                searcher.close()

            # -- second index: the create flag (True) is passed again, so
            #    the three docs below replace the previous content
            writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                                 IndexWriter.MaxFieldLength.LIMITED)
            doc = Document()
            doc.add(Field("contents", "map entry woo",
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("contents", "woo map entry",
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)

            doc = Document()
            doc.add(Field("contents", "map foobarword entry woo",
                          Field.Store.YES, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)

            writer.optimize()
            writer.close()

            searcher = IndexSearcher(directory, True)
            try:
                termQuery = TermQuery(Term("contents", "woo"))
                phraseQuery = PhraseQuery()
                self._addTerms(phraseQuery, "contents", "map", "entry")

                # "woo" is in all three docs; "map entry" is adjacent in
                # only two of them
                self._assertHitCount(3, termQuery, searcher=searcher)
                self._assertHitCount(2, phraseQuery, searcher=searcher)

                # term clause first ...
                booleanQuery = BooleanQuery()
                booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
                booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
                self._assertHitCount(2, booleanQuery, searcher=searcher)

                # ... then phrase clause first: same result expected
                booleanQuery = BooleanQuery()
                booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
                booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
                self._assertHitCount(2, booleanQuery, searcher=searcher)
            finally:
                searcher.close()
        finally:
            directory.close()
260
261
if __name__ == "__main__":
    import sys
    import lucene

    # Start the embedded Java VM before any test touches a Lucene class.
    lucene.initVM()

    if '-loop' in sys.argv:
        # "-loop" is our own switch, not a unittest option: strip it so
        # unittest's argument parsing never sees it, then rerun the suite
        # over and over until the process is interrupted.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except SystemExit:
                # unittest's main() ends every run by calling sys.exit();
                # swallow only that so the loop keeps going, while Ctrl-C
                # (KeyboardInterrupt) and genuine errors still get out.
                pass
    else:
        main()