1 # ====================================================================
2 # Licensed under the Apache License, Version 2.0 (the "License");
3 # you may not use this file except in compliance with the License.
4 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
8 # Unless required by applicable law or agreed to in writing, software
9 # distributed under the License is distributed on an "AS IS" BASIS,
10 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 # See the License for the specific language governing permissions and
12 # limitations under the License.
13 # ====================================================================
15 from unittest import TestCase, main
class FuzzyQueryTestCase(TestCase):
    """
    Unit tests ported from Java Lucene
    """

    def _addDoc(self, text, writer):
        """Add one document to ``writer`` whose "field" holds ``text``.

        The value is both stored (so tests can read it back from a hit)
        and analyzed (so it is searchable).
        """
        doc = Document()
        doc.add(Field("field", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    def _indexTerms(self, terms):
        """Build an in-memory index with one document per entry in ``terms``.

        Returns the RAMDirectory holding the index. The writer is always
        closed before returning so that the documents are committed and
        visible to any searcher opened on the directory afterwards.
        """
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        try:
            for text in terms:
                self._addDoc(text, writer)
        finally:
            # close() commits pending documents; a read-only searcher
            # opened on an uncommitted index would not see them.
            writer.close()
        return directory

    def _fieldValues(self, searcher, scoreDocs):
        """Return the stored "field" value of each hit, in ranking order."""
        return [searcher.doc(scoreDoc.doc).get("field")
                for scoreDoc in scoreDocs]

    def testDefaultFuzziness(self):
        """FuzzyQuery with default settings against five-letter terms."""
        directory = self._indexTerms(["aaaaa", "aaaab", "aaabb", "aabbb",
                                      "abbbb", "bbbbb", "ddddd"])
        searcher = IndexSearcher(directory, True)
        try:
            query = FuzzyQuery(Term("field", "aaaaa"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(3, topDocs.totalHits)

            # not similar enough:
            query = FuzzyQuery(Term("field", "xxxxx"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(0, topDocs.totalHits)
            # edit distance to "aaaaa" = 3
            query = FuzzyQuery(Term("field", "aaccc"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(0, topDocs.totalHits)

            # query identical to a word in the index:
            # (default allows for up to two edits, best match first)
            query = FuzzyQuery(Term("field", "aaaaa"))
            scoreDocs = searcher.search(query, 50).scoreDocs
            self.assertEqual(3, len(scoreDocs))
            self.assertEqual(["aaaaa", "aaaab", "aaabb"],
                             self._fieldValues(searcher, scoreDocs))

            # query similar to a word in the index:
            query = FuzzyQuery(Term("field", "aaaac"))
            scoreDocs = searcher.search(query, 50).scoreDocs
            self.assertEqual(3, len(scoreDocs))
            self.assertEqual(["aaaaa", "aaaab", "aaabb"],
                             self._fieldValues(searcher, scoreDocs))

            query = FuzzyQuery(Term("field", "ddddX"))
            scoreDocs = searcher.search(query, 50).scoreDocs
            self.assertEqual(1, len(scoreDocs))
            self.assertEqual(["ddddd"],
                             self._fieldValues(searcher, scoreDocs))

            # different field = no match:
            query = FuzzyQuery(Term("anotherfield", "ddddX"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(0, topDocs.totalHits)
        finally:
            searcher.close()
            directory.close()

    def testDefaultFuzzinessLong(self):
        """FuzzyQuery with default settings against longer terms."""
        directory = self._indexTerms(["aaaaaaa", "segment"])
        searcher = IndexSearcher(directory, True)
        try:
            # not similar enough:
            query = FuzzyQuery(Term("field", "xxxxx"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(0, topDocs.totalHits)
            # edit distance to "aaaaaaa" = 3, this matches because
            # the string is longer than
            # in testDefaultFuzziness so a bigger difference is allowed:
            query = FuzzyQuery(Term("field", "aaaaccc"))
            scoreDocs = searcher.search(query, 50).scoreDocs
            self.assertEqual(1, len(scoreDocs))
            self.assertEqual(["aaaaaaa"],
                             self._fieldValues(searcher, scoreDocs))

            # no match, more than half of the characters is wrong:
            query = FuzzyQuery(Term("field", "aaacccc"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(0, topDocs.totalHits)

            # "student" and "stellent" are indeed similar to "segment"
            # by default:
            query = FuzzyQuery(Term("field", "student"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(1, topDocs.totalHits)
            query = FuzzyQuery(Term("field", "stellent"))
            topDocs = searcher.search(query, 50)
            self.assertEqual(1, topDocs.totalHits)
        finally:
            searcher.close()
            directory.close()
131 if __name__ == "__main__":
134 if '-loop' in sys.argv:
135 sys.argv.remove('-loop')