# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================

from unittest import TestCase
from cStringIO import StringIO

from lucene import \
    WhitespaceAnalyzer, Document, Field, IndexReader, IndexWriter, Term, \
    IndexSearcher, PhraseQuery, SpanFirstQuery, SpanNearQuery, SpanNotQuery, \
    SpanOrQuery, SpanTermQuery, RAMDirectory, TermAttribute, StringReader

from lia.analysis.AnalyzerUtils import AnalyzerUtils


class SpanQueryTest(TestCase):

    def setUp(self):

        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

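        # index two single-sentence documents in field "f"; with
        # WhitespaceAnalyzer every word is one token, so term positions
        # run 0..8 in each sentence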
        doc = Document()
        doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def assertOnlyBrownFox(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(0, topDocs.scoreDocs[0].doc, "wrong doc")

    def assertBothFoxes(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits)

    def assertNoMatches(self, query):

        topDocs = self.searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)

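    # a SpanTermQuery matches the same documents as a plain TermQuery but
    # also exposes the position span of each hit, which dumpSpans prints
    # with the matched range wrapped in angle brackets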
    def testSpanTermQuery(self):

        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

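    # SpanFirstQuery(match, end) accepts a span only if it ends at or
    # before position `end`; the span for "brown" starts at 2 and ends
    # at 3, so an end of 2 misses and an end of 3 matches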
    def testSpanFirstQuery(self):

        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)

        sfq = SpanFirstQuery(self.brown, 3)
        self.assertOnlyBrownFox(sfq)

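    # SpanNearQuery(clauses, slop, inOrder) counts slop as the number of
    # intervening positions allowed between the clause spans, whereas
    # PhraseQuery slop is an edit distance - hence the different values below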
    def testSpanNearQuery(self):

        quick_brown_dog = [self.quick, self.brown, self.dog]
        snq = SpanNearQuery(quick_brown_dog, 0, True)
        self.assertNoMatches(snq)

        snq = SpanNearQuery(quick_brown_dog, 4, True)
        self.assertNoMatches(snq)

        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)

        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)

        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)

        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

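    # SpanNotQuery(include, exclude) keeps spans of `include` that do not
    # overlap any span of `exclude`: "dog" lies outside both "quick ... fox"
    # spans, but "red" falls inside the span in the cat document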
    def testSpanNotQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)

        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)

        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

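    # SpanOrQuery matches wherever any of its clause queries produces a
    # span, so the union of the lazy-dog and sleepy-cat variants hits
    # both documents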
    def testSpanOrQuery(self):

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)

        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)

        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)

        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)

        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)

        self.dumpSpans(SpanTermQuery(Term("f", "the")))

        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

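    # walk every span the query produces and echo the document text with
    # the matched range wrapped in <...>, followed by that document's score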
    def dumpSpans(self, query):

        spans = query.getSpans(self.reader)
        print "%s:" % query

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        scores = {}
        for scoreDoc in scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score

        while spans.next():
            id = spans.doc()
            doc = self.reader.document(id)

            # for simplicity - assume tokens are in sequential
            # positions, starting from 0
            stream = self.analyzer.tokenStream("contents",
                                               StringReader(doc.get("f")))
            term = stream.addAttribute(TermAttribute.class_)

            buffer = StringIO()
            buffer.write("   ")
            i = 0
            while stream.incrementToken():
                if i == spans.start():
                    buffer.write("<")
                buffer.write(term.term())
                if i + 1 == spans.end():
                    buffer.write(">")
                buffer.write(" ")
                i += 1
            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")
            print buffer.getvalue()
            # print self.searcher.explain(query, id)