Configured build for Ubuntu and added Stempel polish analyzer
[pylucene.git] / samples / LuceneInAction / lia / advsearching / SpanQueryTest.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 from unittest import TestCase
16 from cStringIO import StringIO
17
18 from lucene import \
19      WhitespaceAnalyzer, Document, Field, IndexReader, IndexWriter, Term, \
20      IndexSearcher, PhraseQuery, SpanFirstQuery, SpanNearQuery, SpanNotQuery, \
21      SpanOrQuery, SpanTermQuery, RAMDirectory, TermAttribute, StringReader
22
23 from lia.analysis.AnalyzerUtils import AnalyzerUtils
24
25
class SpanQueryTest(TestCase):
    """Exercise Lucene span queries against a tiny in-memory index.

    setUp() indexes two documents in field "f" using a WhitespaceAnalyzer:

        doc 0: "the quick brown fox jumps over the lazy dog"
        doc 1: "the quick red fox jumps over the sleepy cat"

    Each test builds span queries out of the SpanTermQuery instances
    created in setUp(), checks which documents match, and prints the
    matching spans through dumpSpans() for visual inspection.
    """

    def setUp(self):
        """Build the two-document index and the shared span term queries."""

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        try:
            for text in ("the quick brown fox jumps over the lazy dog",
                         "the quick red fox jumps over the sleepy cat"):
                doc = Document()
                doc.add(Field("f", text,
                              Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
        finally:
            # always close the writer, even if indexing a document failed
            writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def tearDown(self):
        """Close the searcher, reader and directory opened by setUp()."""

        self.searcher.close()
        self.reader.close()
        self.directory.close()

    def assertOnlyBrownFox(self, query):
        """Assert that query matches exactly one document: doc 0."""

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(0, topDocs.scoreDocs[0].doc, "wrong doc")

    def assertBothFoxes(self, query):
        """Assert that query matches both indexed documents."""

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits)

    def assertNoMatches(self, query):
        """Assert that query matches no document at all."""

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)

    def testSpanTermQuery(self):
        """A span term query on "brown" matches only doc 0."""

        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

    def testSpanFirstQuery(self):
        """"brown" is not found with an end limit of 2, but is with 3."""

        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)

        self.dumpSpans(sfq)

        sfq = SpanFirstQuery(self.brown, 3)
        self.dumpSpans(sfq)
        self.assertOnlyBrownFox(sfq)

    def testSpanNearQuery(self):
        """Compare SpanNearQuery slop with the slop a PhraseQuery needs."""

        quick_brown_dog = [self.quick, self.brown, self.dog]

        # in-order "quick brown dog": slop 0 and 4 are too tight
        for slop in (0, 4):
            snq = SpanNearQuery(quick_brown_dog, slop, True)
            self.assertNoMatches(snq)
            self.dumpSpans(snq)

        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)

        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)

        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

    def testSpanNotQuery(self):
        """Excluding "dog" changes nothing; excluding "red" drops doc 1."""

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)

        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)

        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

    def testSpanOrQuery(self):
        """OR-ing two nested near queries matches both documents."""

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)

        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)

        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)

        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

    def testPlay(self):
        """Dump the spans of a few assorted queries; nothing is asserted."""

        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)

        self.dumpSpans(SpanTermQuery(Term("f", "the")))

        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

    def dumpSpans(self, query):
        """Print every span matched by query, one line per span.

        Each line shows the stored text of the matching document with the
        span wrapped in "<" and ">", followed by the document's score for
        the query in parentheses. "   No spans" is printed when the query
        produces no span at all.
        """

        spans = query.getSpans(self.reader)
        print("%s:" % query)
        numSpans = 0

        # score of each hit, keyed by document id; a dict copes with any
        # number of documents in the index
        scores = {}
        for scoreDoc in self.searcher.search(query, 50).scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score

        while spans.next():
            numSpans += 1

            docId = spans.doc()
            text = self.reader.document(docId).get("f")

            marked = self._markSpan(text, spans.start(), spans.end())
            # documents without a recorded score are reported as 0
            print(marked + "(" + str(scores.get(docId, 0)) + ") ")
            # print(self.searcher.explain(query, docId))

        if numSpans == 0:
            print("   No spans")

        print('')

    def _markSpan(self, text, start, end):
        """Return text re-tokenized with the span [start, end) bracketed.

        The result starts with three spaces, and every token is followed
        by a single space.
        """

        # for simplicity - assume tokens are at sequential positions,
        # starting from 0
        stream = self.analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)

        out = StringIO()
        out.write("   ")

        try:
            position = 0
            while stream.incrementToken():
                if position == start:
                    out.write("<")

                out.write(term.term())
                # end is exclusive: close the bracket after the last token
                if position + 1 == end:
                    out.write(">")

                out.write(" ")
                position += 1
        finally:
            stream.close()

        return out.getvalue()