PyLucene 3.4.0-1 import
[pylucene.git] / test / test_FuzzyQuery.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 from unittest import TestCase, main
16 from lucene import *
17
18
class FuzzyQueryTestCase(TestCase):
    """
    Unit tests ported from Java Lucene

    Each test fills an in-memory index with single-word documents and
    checks which of them a FuzzyQuery built with default settings matches.
    """

    def _addDoc(self, text, writer):
        """
        Add to writer one document whose stored, analyzed field "field"
        holds text.
        """

        doc = Document()
        doc.add(Field("field", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    def _fillIndex(self, directory, texts):
        """
        Index one document per entry of texts, in order, into directory,
        then optimize.  The writer is closed even if indexing fails.
        """

        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        try:
            for text in texts:
                self._addDoc(text, writer)
            writer.optimize()
        finally:
            writer.close()

    def _hitCount(self, searcher, text, field="field"):
        """
        Return totalHits for a default FuzzyQuery on text in field,
        requesting at most 50 results.
        """

        query = FuzzyQuery(Term(field, text))
        return searcher.search(query, 50).totalHits

    def _hitValues(self, searcher, text):
        """
        Return, in result order, the stored "field" value of every hit of
        a default FuzzyQuery on text (at most 50 results).
        """

        query = FuzzyQuery(Term("field", text))
        scoreDocs = searcher.search(query, 50).scoreDocs
        return [searcher.doc(scoreDoc.doc).get("field")
                for scoreDoc in scoreDocs]

    def testDefaultFuzziness(self):
        """
        Default fuzzy matching against an index of five-letter words.
        """

        directory = RAMDirectory()
        try:
            self._fillIndex(directory,
                            ["aaaaa", "aaaab", "aaabb", "aabbb",
                             "abbbb", "bbbbb", "ddddd"])
            searcher = IndexSearcher(directory, True)
            try:
                self.assertEqual(3, self._hitCount(searcher, "aaaaa"))

                # not similar enough:
                self.assertEqual(0, self._hitCount(searcher, "xxxxx"))
                # edit distance to "aaaaa" = 3
                self.assertEqual(0, self._hitCount(searcher, "aaccc"))

                # query identical to a word in the index;
                # default allows for up to two edits:
                self.assertEqual(["aaaaa", "aaaab", "aaabb"],
                                 self._hitValues(searcher, "aaaaa"))

                # query similar to a word in the index:
                self.assertEqual(["aaaaa", "aaaab", "aaabb"],
                                 self._hitValues(searcher, "aaaac"))

                self.assertEqual(["ddddd"],
                                 self._hitValues(searcher, "ddddX"))

                # different field = no match:
                self.assertEqual(0, self._hitCount(searcher, "ddddX",
                                                   "anotherfield"))
            finally:
                # close the searcher even when an assertion above fails
                searcher.close()
        finally:
            directory.close()

    def testDefaultFuzzinessLong(self):
        """
        Default fuzzy matching against an index of seven-letter words.
        """

        directory = RAMDirectory()
        try:
            self._fillIndex(directory, ["aaaaaaa", "segment"])
            searcher = IndexSearcher(directory, True)
            try:
                # not similar enough:
                self.assertEqual(0, self._hitCount(searcher, "xxxxx"))

                # edit distance to "aaaaaaa" = 3, this matches because
                # the string is longer than in testDefaultFuzziness so
                # a bigger difference is allowed:
                self.assertEqual(["aaaaaaa"],
                                 self._hitValues(searcher, "aaaaccc"))

                # no match, more than half of the characters are wrong:
                self.assertEqual(0, self._hitCount(searcher, "aaacccc"))

                # "student" and "stellent" are indeed similar to
                # "segment" by default:
                self.assertEqual(1, self._hitCount(searcher, "student"))
                self.assertEqual(1, self._hitCount(searcher, "stellent"))
            finally:
                # close the searcher even when an assertion above fails
                searcher.close()
        finally:
            directory.close()
129
130
if __name__ == "__main__":
    import sys
    import lucene

    # start the Java VM through lucene.initVM() before running any test
    lucene.initVM()
    if '-loop' in sys.argv:
        # -loop: rerun the whole suite over and over until interrupted
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except (Exception, SystemExit):
                # unittest's main() finishes each run by raising
                # SystemExit; swallow it (and any run error) so the loop
                # keeps going.  KeyboardInterrupt is deliberately not
                # caught, so Ctrl-C can still stop the loop.
                pass
    else:
        main()