old python needs __main__ to call a module
[pylucene.git] / test / test_Sort.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 import math
16
17 from itertools import izip
18 from random import randint
19 from unittest import TestCase, main
20 from lucene import *
21
# Number of random-string documents that getFullStrings() adds to its index.
NUM_STRINGS = 6000
23
24
25
26 class SortTestCase(TestCase):
27     """
28     Unit tests for sorting code, ported from Java Lucene
29     """
30
    def __init__(self, *args, **kwds):
        """
        Forward all arguments to TestCase and set up the fixture table.

        self.data holds one row per test document.  Column k of a row is
        the raw string value that _getIndex() stores in the k-th field
        (tracer, contents, int, float, string, custom, i18n, long, double,
        short, byte, parser).  A None cell means _getIndex() does not add
        that field to the document at all.
        """

        super(SortTestCase, self).__init__(*args, **kwds)

        self.data = [
    #      tracer  contents         int            float           string   custom   i18n               long                  double                 short                 byte            custom parser encoding
        [   "A",   "x a",           "5",           "4f",           "c",    "A-3",   u"p\u00EAche",      "10",                  "-4.0",                "3",                  "126",          "J"  ],
        [   "B",   "y a",           "5",           "3.4028235E38", "i",    "B-10",  "HAT",             "1000000000",          "40.0",                "24",                 "1",            "I"  ],
        [   "C",   "x a b c",       "2147483647",  "1.0",          "j",    "A-2",   u"p\u00E9ch\u00E9", "99999999",            "40.00002343",         "125",                "15",           "H"  ],
        [   "D",   "y a b c",       "-1",          "0.0f",         "a",     "C-0",   "HUT",             str(Long.MAX_VALUE),  str(Double.MIN_VALUE), str(Short.MIN_VALUE), str(Byte.MIN_VALUE), "G"  ],
        [   "E",   "x a b c d",     "5",           "2f",           "h",     "B-8",   "peach",           str(Long.MIN_VALUE),  str(Double.MAX_VALUE), str(Short.MAX_VALUE), str(Byte.MAX_VALUE), "F"  ],
        [   "F",   "y a b c d",     "2",           "3.14159f",     "g",     "B-1",   u"H\u00C5T",        "-44",                "343.034435444",       "-3",                 "0",            "E"  ],
        [   "G",   "x a b c d",     "3",           "-1.0",         "f",     "C-100", "sin",             "323254543543",       "4.043544",            "5",                  "100",          "D"  ],
        [   "H",   "y a b c d",     "0",           "1.4E-45",      "e",     "C-88",  u"H\u00D8T",        "1023423423005",      "4.043545",            "10",                 "-50",          "C"  ],
        [   "I",   "x a b c d e f", "-2147483648", "1.0e+0",       "d",     "A-10",  u"s\u00EDn",        "332422459999",       "4.043546",            "-340",               "51",           "B"  ],
        [   "J",   "y a b c d e f", "4",           ".5",           "b",     "C-7",   "HOT",             "34334543543",        "4.0000220343",        "300",                "2",            "A"  ],
        # The rows below leave most columns empty (None); testEmptyFieldSort
        # uses them to check sorting on fields that are missing.
        [   "W",   "g",             "1",           None,           None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        [   "X",   "g",             "1",           "0.1",          None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        [   "Y",   "g",             "1",           "0.2",          None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        [   "Z",   "f g",           None,          None,           None,    None,    None,              None,                 None,                  None,                 None,           None  ],
        ]
52
53     def _getIndex(self, even, odd):
54
55         indexStore = RAMDirectory()
56         writer = IndexWriter(indexStore, SimpleAnalyzer(), True,
57                              IndexWriter.MaxFieldLength.LIMITED)
58         writer.setMaxBufferedDocs(2)
59         writer.setMergeFactor(1000)
60
61         for i in xrange(len(self.data)):
62             if (i % 2 == 0 and even) or (i % 2 == 1 and odd):
63                 doc = Document()
64                 doc.add(Field("tracer", self.data[i][0], Field.Store.YES,
65                               Field.Index.NO))
66                 doc.add(Field("contents", self.data[i][1], Field.Store.NO,
67                               Field.Index.ANALYZED))
68                 if self.data[i][2] is not None:
69                     doc.add(Field("int", self.data[i][2], Field.Store.NO,
70                                   Field.Index.NOT_ANALYZED))
71                 if self.data[i][3] is not None:
72                     doc.add(Field("float", self.data[i][3], Field.Store.NO,
73                                   Field.Index.NOT_ANALYZED))
74                 if self.data[i][4] is not None:
75                     doc.add(Field("string", self.data[i][4], Field.Store.NO,
76                                   Field.Index.NOT_ANALYZED))
77                 if self.data[i][5] is not None:
78                     doc.add(Field("custom", self.data[i][5], Field.Store.NO,
79                                   Field.Index.NOT_ANALYZED))
80                 if self.data[i][6] is not None:
81                     doc.add(Field("i18n", self.data[i][6], Field.Store.NO,
82                                   Field.Index.NOT_ANALYZED))
83                 if self.data[i][7] is not None:
84                     doc.add(Field("long", self.data[i][7], Field.Store.NO,
85                                   Field.Index.NOT_ANALYZED))
86                 if self.data[i][8] is not None:
87                     doc.add(Field("double", self.data[i][8], Field.Store.NO,
88                                   Field.Index.NOT_ANALYZED))
89                 if self.data[i][9] is not None:
90                     doc.add(Field("short", self.data[i][9], Field.Store.NO,
91                                   Field.Index.NOT_ANALYZED))
92                 if self.data[i][10] is not None:
93                     doc.add(Field("byte", self.data[i][10], Field.Store.NO,
94                                   Field.Index.NOT_ANALYZED))
95                 if self.data[i][11] is not None:
96                     doc.add(Field("parser", self.data[i][11], Field.Store.NO,
97                                   Field.Index.NOT_ANALYZED))
98                 doc.setBoost(2.0)  # produce some scores above 1.0
99                 writer.addDocument(doc)
100         # writer.optimize()
101         writer.close()
102         s = IndexSearcher(indexStore, True)
103         s.setDefaultFieldSortScoring(True, True)
104
105         return s
106
107     def _getFullIndex(self):
108         return self._getIndex(True, True)
109
110     def getFullStrings(self):
111
112         indexStore = RAMDirectory()
113         writer = IndexWriter(indexStore, SimpleAnalyzer(), True,
114                              IndexWriter.MaxFieldLength.LIMITED)
115         writer.setMaxBufferedDocs(4)
116         writer.setMergeFactor(97)
117         
118         for i in xrange(NUM_STRINGS):
119             doc = Document()
120             num = self.getRandomCharString(self.getRandomNumber(2, 8), 48, 52)
121             doc.add(Field("tracer", num, Field.Store.YES, Field.Index.NO))
122             # doc.add(Field("contents", str(i), Field.Store.NO,
123             #         Field.Index.ANALYZED))
124             doc.add(Field("string", num, Field.Store.NO,
125                           Field.Index.NOT_ANALYZED))
126             num2 = self.getRandomCharString(self.getRandomNumber(1, 4), 48, 50)
127             doc.add(Field("string2", num2, Field.Store.NO,
128                           Field.Index.NOT_ANALYZED))
129             doc.add(Field("tracer2", num2, Field.Store.YES, Field.Index.NO))
130             doc.setBoost(2.0)  # produce some scores above 1.0
131             writer.setMaxBufferedDocs(self.getRandomNumber(2, 12))
132             writer.addDocument(doc)
133       
134         # writer.optimize()
135         # print writer.getSegmentCount()
136         writer.close()
137
138         return IndexSearcher(indexStore, True)
139   
140     def getRandomNumberString(self, num, low, high):
141
142         return ''.join([self.getRandomNumber(low, high) for i in xrange(num)])
143   
144     def getRandomCharString(self, num):
145
146         return self.getRandomCharString(num, 48, 122)
147   
148     def getRandomCharString(self, num,  start, end):
149         
150         return ''.join([chr(self.getRandomNumber(start, end))
151                         for i in xrange(num)])
152   
153     def getRandomNumber(self, low, high):
154   
155         return randint(low, high)
156
157     def _getXIndex(self):
158         return self._getIndex(True, False)
159
160     def _getYIndex(self):
161         return self._getIndex(False, True)
162
163     def _getEmptyIndex(self):
164         return self._getIndex(False, False)
165
166     def setUp(self):
167
168         self.full = self._getFullIndex()
169         self.searchX = self._getXIndex()
170         self.searchY = self._getYIndex()
171         self.queryX = TermQuery(Term("contents", "x"))
172         self.queryY = TermQuery(Term("contents", "y"))
173         self.queryA = TermQuery(Term("contents", "a"))
174         self.queryE = TermQuery(Term("contents", "e"))
175         self.queryF = TermQuery(Term("contents", "f"))
176         self.queryG = TermQuery(Term("contents", "g"))
177
178     def testBuiltInSorts(self):
179         """
180         test the sorts by score and document number
181         """
182
183         sort = Sort()
184         self._assertMatches(self.full, self.queryX, sort, "ACEGI")
185         self._assertMatches(self.full, self.queryY, sort, "BDFHJ")
186
187         sort.setSort(SortField.FIELD_DOC)
188         self._assertMatches(self.full, self.queryX, sort, "ACEGI")
189         self._assertMatches(self.full, self.queryY, sort, "BDFHJ")
190
191     def testTypedSort(self):
192         """
193         test sorts where the type of field is specified
194         """
195
196         sort = Sort()
197
198         sort.setSort([SortField("int", SortField.INT),
199                       SortField.FIELD_DOC])
200         self._assertMatches(self.full, self.queryX, sort, "IGAEC")
201         self._assertMatches(self.full, self.queryY, sort, "DHFJB")
202
203         sort.setSort([SortField("float", SortField.FLOAT),
204                       SortField.FIELD_DOC])
205         self._assertMatches(self.full, self.queryX, sort, "GCIEA")
206         self._assertMatches(self.full, self.queryY, sort, "DHJFB")
207
208         sort.setSort([SortField("long", SortField.LONG),
209                       SortField.FIELD_DOC])
210         self._assertMatches(self.full, self.queryX, sort, "EACGI")
211         self._assertMatches(self.full, self.queryY, sort, "FBJHD")
212
213         sort.setSort([SortField("double", SortField.DOUBLE),
214                       SortField.FIELD_DOC])
215         self._assertMatches(self.full, self.queryX, sort, "AGICE")
216         self._assertMatches(self.full, self.queryY, sort, "DJHBF")
217
218         sort.setSort([SortField("byte", SortField.BYTE),
219                       SortField.FIELD_DOC])
220         self._assertMatches(self.full, self.queryX, sort, "CIGAE")
221         self._assertMatches(self.full, self.queryY, sort, "DHFBJ")
222
223         sort.setSort([SortField("short", SortField.SHORT),
224                       SortField.FIELD_DOC])
225         self._assertMatches(self.full, self.queryX, sort, "IAGCE")
226         self._assertMatches(self.full, self.queryY, sort, "DFHBJ")
227
228         sort.setSort([SortField("string", SortField.STRING),
229                       SortField.FIELD_DOC])
230         self._assertMatches(self.full, self.queryX, sort, "AIGEC")
231         self._assertMatches(self.full, self.queryY, sort, "DJHFB")
232   
233     def testStringSort(self):
234         """
235         Test String sorting: small queue to many matches, multi field sort,
236         reverse sort
237         """
238
239         sort = Sort()
240         searcher = self.getFullStrings()
241
242         sort.setSort([SortField("string", SortField.STRING),
243                       SortField("string2", SortField.STRING, True),
244                       SortField.FIELD_DOC])
245
246         result = searcher.search(MatchAllDocsQuery(), None, 500, sort).scoreDocs
247
248         buff = []
249         last = None
250         lastSub = None
251         lastDocId = 0
252         fail = False
253
254         for scoreDoc in result:
255             doc2 = searcher.doc(scoreDoc.doc)
256             v = doc2.getValues("tracer")
257             v2 = doc2.getValues("tracer2")
258             for _v, _v2 in izip(v, v2):
259                 if last is not None:
260                     _cmp = cmp(_v, last)
261                     if _cmp < 0: # ensure first field is in order
262                         fail = True
263                         print "fail:", _v, "<", last
264
265                     if _cmp == 0: # ensure second field is in reverse order
266                         _cmp = cmp(_v2, lastSub)
267                         if _cmp > 0:
268                             fail = True
269                             print "rev field fail:", _v2, ">", lastSub
270                         elif _cmp == 0: # ensure docid is in order
271                             if scoreDoc.doc < lastDocId:
272                                 fail = True
273                                 print "doc fail:", scoreDoc.doc, ">", lastDocId
274
275                 last = _v
276                 lastSub = _v2
277                 lastDocId = scoreDoc.doc
278                 buff.append(_v + "(" + _v2 + ")(" + str(scoreDoc.doc) + ") ")
279
280         if fail:
281             print "topn field1(field2)(docID):", ''.join(buff)
282
283         self.assert_(not fail, "Found sort results out of order")
284   
285     def testCustomFieldParserSort(self):
286         """
287         test sorts where the type of field is specified and a custom field
288         parser is used, that uses a simple char encoding. The sorted string
289         contains a character beginning from 'A' that is mapped to a numeric
290         value using some "funny" algorithm to be different for each data
291         type.
292         """
293
294         # since tests explicitly use different parsers on the same field name
295         # we explicitly check/purge the FieldCache between each assertMatch
296         fc = FieldCache.DEFAULT
297         
298         class intParser(PythonIntParser):
299             def parseInt(_self, val):
300                 return (ord(val[0]) - ord('A')) * 123456
301
302         class floatParser(PythonFloatParser):
303             def parseFloat(_self, val):
304                 return math.sqrt(ord(val[0]))
305
306         class longParser(PythonLongParser):
307             def parseLong(_self, val):
308                 return (ord(val[0]) - ord('A')) * 1234567890L
309
310         class doubleParser(PythonDoubleParser):
311             def parseDouble(_self, val):
312                 return math.pow(ord(val[0]), ord(val[0]) - ord('A'))
313
314         class byteParser(PythonByteParser):
315             def parseByte(_self, val):
316                 return chr(ord(val[0]) - ord('A'))
317
318         class shortParser(PythonShortParser):
319             def parseShort(_self, val):
320                 return ord(val[0]) - ord('A')
321
322         sort = Sort()
323         sort.setSort([SortField("parser", intParser()),
324                       SortField.FIELD_DOC])
325         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
326         self._assertSaneFieldCaches(self.getName() + " IntParser")
327         fc.purgeAllCaches()
328
329         sort.setSort([SortField("parser", floatParser()),
330                       SortField.FIELD_DOC])
331         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
332         self._assertSaneFieldCaches(self.getName() + " FloatParser")
333         fc.purgeAllCaches()
334
335         sort.setSort([SortField("parser", longParser()),
336                            SortField.FIELD_DOC])
337         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
338         self._assertSaneFieldCaches(self.getName() + " LongParser")
339         fc.purgeAllCaches()
340
341         sort.setSort([SortField("parser", doubleParser()),
342                       SortField.FIELD_DOC])
343         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
344         self._assertSaneFieldCaches(self.getName() + " DoubleParser")
345         fc.purgeAllCaches()
346
347         sort.setSort([SortField("parser", byteParser()),
348                       SortField.FIELD_DOC])
349         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
350         self._assertSaneFieldCaches(self.getName() + " ByteParser")
351         fc.purgeAllCaches()
352
353         sort.setSort([SortField("parser", shortParser()),
354                       SortField.FIELD_DOC])
355         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
356         self._assertSaneFieldCaches(self.getName() + " ShortParser")
357         fc.purgeAllCaches()
358
359     def testEmptyIndex(self):
360         """
361         test sorts when there's nothing in the index
362         """
363
364         sort = Sort()
365         empty = self._getEmptyIndex()
366
367         self._assertMatches(empty, self.queryX, sort, "")
368
369         sort.setSort(SortField.FIELD_DOC)
370         self._assertMatches(empty, self.queryX, sort, "")
371
372         sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC])
373         self._assertMatches(empty, self.queryX, sort, "")
374
375         sort.setSort([SortField("string", SortField.STRING, True),
376                       SortField.FIELD_DOC])
377         self._assertMatches(empty, self.queryX, sort, "")
378
379         sort.setSort([SortField("float", SortField.FLOAT),
380                       SortField("string", SortField.STRING)])
381         self._assertMatches(empty, self.queryX, sort, "")
382
383
384     def testNewCustomFieldParserSort(self):
385         """
386         Test sorting w/ custom FieldComparator
387         """
388         sort = Sort()
389
390         sort.setSort([SortField("parser", MyFieldComparatorSource())])
391         self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
392
393     def testReverseSort(self):
394         """
395         test sorts in reverse
396         """
397         sort = Sort()
398
399         sort.setSort([SortField(None, SortField.SCORE, True),
400                       SortField.FIELD_DOC])
401         self._assertMatches(self.full, self.queryX, sort, "IEGCA")
402         self._assertMatches(self.full, self.queryY, sort, "JFHDB")
403
404         sort.setSort(SortField(None, SortField.DOC, True))
405         self._assertMatches(self.full, self.queryX, sort, "IGECA")
406         self._assertMatches(self.full, self.queryY, sort, "JHFDB")
407
408         sort.setSort(SortField("int", SortField.INT, True))
409         self._assertMatches(self.full, self.queryX, sort, "CAEGI")
410         self._assertMatches(self.full, self.queryY, sort, "BJFHD")
411
412         sort.setSort(SortField("float", SortField.FLOAT, True))
413         self._assertMatches(self.full, self.queryX, sort, "AECIG")
414         self._assertMatches(self.full, self.queryY, sort, "BFJHD")
415
416         sort.setSort(SortField("string", SortField.STRING, True))
417         self._assertMatches(self.full, self.queryX, sort, "CEGIA")
418         self._assertMatches(self.full, self.queryY, sort, "BFHJD")
419
420     def testEmptyFieldSort(self):
421         """
422         test sorting when the sort field is empty(undefined) for some of the
423         documents
424         """
425         sort = Sort()
426
427         sort.setSort(SortField("string", SortField.STRING))
428         self._assertMatches(self.full, self.queryF, sort, "ZJI")
429
430         sort.setSort(SortField("string", SortField.STRING, True))
431         self._assertMatches(self.full, self.queryF, sort, "IJZ")
432     
433         sort.setSort(SortField("i18n", Locale.ENGLISH))
434         self._assertMatches(self.full, self.queryF, sort, "ZJI")
435     
436         sort.setSort(SortField("i18n", Locale.ENGLISH, True))
437         self._assertMatches(self.full, self.queryF, sort, "IJZ")
438
439         sort.setSort(SortField("int", SortField.INT))
440         self._assertMatches(self.full, self.queryF, sort, "IZJ")
441
442         sort.setSort(SortField("int", SortField.INT, True))
443         self._assertMatches(self.full, self.queryF, sort, "JZI")
444
445         sort.setSort(SortField("float", SortField.FLOAT))
446         self._assertMatches(self.full, self.queryF, sort, "ZJI")
447
448         # using a nonexisting field as first sort key shouldn't make a
449         # difference:
450         sort.setSort([SortField("nosuchfield", SortField.STRING),
451                       SortField("float", SortField.FLOAT)])
452         self._assertMatches(self.full, self.queryF, sort, "ZJI")
453
454         sort.setSort(SortField("float", SortField.FLOAT, True))
455         self._assertMatches(self.full, self.queryF, sort, "IJZ")
456
457         # When a field is None for both documents, the next SortField should
458         # be used. 
459         # Works for
460         sort.setSort([SortField("int", SortField.INT),
461                       SortField("string", SortField.STRING),
462                       SortField("float", SortField.FLOAT)])
463         self._assertMatches(self.full, self.queryG, sort, "ZWXY")
464
465         # Reverse the last criterium to make sure the test didn't pass by
466         # chance 
467         sort.setSort([SortField("int", SortField.INT),
468                       SortField("string", SortField.STRING),
469                       SortField("float", SortField.FLOAT, True)])
470         self._assertMatches(self.full, self.queryG, sort, "ZYXW")
471
472         # Do the same for a MultiSearcher
473         multiSearcher = MultiSearcher([self.full])
474
475         sort.setSort([SortField("int", SortField.INT),
476                       SortField("string", SortField.STRING),
477                       SortField("float", SortField.FLOAT)])
478         self._assertMatches(multiSearcher, self.queryG, sort, "ZWXY")
479
480         sort.setSort([SortField("int", SortField.INT),
481                       SortField("string", SortField.STRING),
482                       SortField("float", SortField.FLOAT, True)])
483         self._assertMatches(multiSearcher, self.queryG, sort, "ZYXW")
484
485         # Don't close the multiSearcher. it would close the full searcher too!
486         # Do the same for a ParallelMultiSearcher
487         parallelSearcher = ParallelMultiSearcher([self.full])
488
489         sort.setSort([SortField("int", SortField.INT),
490                       SortField("string", SortField.STRING),
491                       SortField("float", SortField.FLOAT)])
492         self._assertMatches(parallelSearcher, self.queryG, sort, "ZWXY")
493
494         sort.setSort([SortField("int", SortField.INT),
495                       SortField("string", SortField.STRING),
496                       SortField("float", SortField.FLOAT, True)])
497         self._assertMatches(parallelSearcher, self.queryG, sort, "ZYXW")
498
499         # Don't close the parallelSearcher. it would close the full searcher
500         # too!
501
502     def testSortCombos(self):
503         """
504         test sorts using a series of fields
505         """
506         sort = Sort()
507
508         sort.setSort([SortField("int", SortField.INT),
509                       SortField("float", SortField.FLOAT)])
510         self._assertMatches(self.full, self.queryX, sort, "IGEAC")
511
512         sort.setSort([SortField("int", SortField.INT, True),
513                       SortField(None, SortField.DOC, True)])
514         self._assertMatches(self.full, self.queryX, sort, "CEAGI")
515
516         sort.setSort([SortField("float", SortField.FLOAT),
517                       SortField("string", SortField.STRING)])
518         self._assertMatches(self.full, self.queryX, sort, "GICEA")
519
520     def testLocaleSort(self):
521         """
522         test using a Locale for sorting strings
523         """
524         sort = Sort()
525
526         sort.setSort([SortField("string", Locale.US)])
527         self._assertMatches(self.full, self.queryX, sort, "AIGEC")
528         self._assertMatches(self.full, self.queryY, sort, "DJHFB")
529
530         sort.setSort([SortField("string", Locale.US, True)])
531         self._assertMatches(self.full, self.queryX, sort, "CEGIA")
532         self._assertMatches(self.full, self.queryY, sort, "BFHJD")
533
534     def testInternationalSort(self):
535         """
536         test using various international locales with accented characters
537         (which sort differently depending on locale)
538         """
539         sort = Sort()
540
541         sort.setSort(SortField("i18n", Locale.US))
542         self._assertMatches(self.full, self.queryY, sort, "BFJDH")
543
544         sort.setSort(SortField("i18n", Locale("sv", "se")))
545         self._assertMatches(self.full, self.queryY, sort, "BJDFH")
546
547         sort.setSort(SortField("i18n", Locale("da", "dk")))
548         self._assertMatches(self.full, self.queryY, sort, "BJDHF")
549
550         sort.setSort(SortField("i18n", Locale.US))
551         self._assertMatches(self.full, self.queryX, sort, "ECAGI")
552
553         sort.setSort(SortField("i18n", Locale.FRANCE))
554         self._assertMatches(self.full, self.queryX, sort, "EACGI")
555
556     def testInternationalMultiSearcherSort(self):
557         """
558         Test the MultiSearcher's ability to preserve locale-sensitive ordering
559         by wrapping it around a single searcher
560         """
561         sort = Sort()
562
563         multiSearcher = MultiSearcher([self.full])
564         sort.setSort(SortField("i18n", Locale("sv", "se")))
565         self._assertMatches(multiSearcher, self.queryY, sort, "BJDFH")
566     
567         sort.setSort(SortField("i18n", Locale.US))
568         self._assertMatches(multiSearcher, self.queryY, sort, "BFJDH")
569     
570         sort.setSort(SortField("i18n", Locale("da", "dk")))
571         self._assertMatches(multiSearcher, self.queryY, sort, "BJDHF")
572     
573     def testMultiSort(self):
574         """
575         test a variety of sorts using more than one searcher
576         """
577         
578         searcher = MultiSearcher([self.searchX, self.searchY])
579         self.runMultiSorts(searcher, False)
580
581     def testParallelMultiSort(self):
582         """
583         test a variety of sorts using a parallel multisearcher
584         """
585
586         searcher = ParallelMultiSearcher([self.searchX, self.searchY])
587         self.runMultiSorts(searcher, False)
588
589     def testNormalizedScores(self):
590         """
591         test that the relevancy scores are the same even if
592         hits are sorted
593         """
594
595         # capture relevancy scores
596         scoresX = self.getScores(self.full.search(self.queryX, None,
597                                                   1000).scoreDocs, self.full)
598         scoresY = self.getScores(self.full.search(self.queryY, None,
599                                                   1000).scoreDocs, self.full)
600         scoresA = self.getScores(self.full.search(self.queryA, None,
601                                                   1000).scoreDocs, self.full)
602
603         # we'll test searching locally, remote and multi
604         multi = MultiSearcher([self.searchX, self.searchY])
605
606         # change sorting and make sure relevancy stays the same
607
608         sort = Sort()
609         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
610         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
611         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
612         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
613         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
614         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
615
616         sort.setSort(SortField.FIELD_DOC)
617         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
618         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
619         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
620         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
621         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
622         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
623
624         sort.setSort(SortField("int", SortField.INT))
625         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
626         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
627         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
628         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
629         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
630         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
631
632         sort.setSort(SortField("float", SortField.FLOAT))
633         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
634         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
635         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
636         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
637         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
638         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
639
640         sort.setSort(SortField("string", SortField.STRING))
641         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
642         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
643         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
644         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
645         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
646         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
647
648         sort.setSort([SortField("int", SortField.INT),
649                       SortField("float", SortField.FLOAT)])
650         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
651         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
652         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
653         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
654         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
655         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
656
657         sort.setSort([SortField("int", SortField.INT, True),
658                       SortField(None, SortField.DOC, True)])
659         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
660         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
661         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
662         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
663         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
664         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
665
666         sort.setSort([SortField("float", SortField.FLOAT),
667                       SortField("string", SortField.STRING)])
668         self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
669         self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
670         self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
671         self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
672         self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
673         self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
674
675     def testTopDocsScores(self):
676         """
677         There was previously a bug in FieldSortedHitQueue.maxscore when only
678         a single doc was added.  That is what the following tests for.
679         """
680         
681         sort = Sort()
682         nDocs = 10
683
684         # try to pick a query that will result in an unnormalized
685         # score greater than 1 to test for correct normalization
686         docs1 = self.full.search(self.queryE, None, nDocs, sort)
687
688         # a filter that only allows through the first hit
689         class filter(PythonFilter):
690             def getDocIdSet(_self, reader):
691                 bs = BitSet(reader.maxDoc())
692                 bs.set(0, reader.maxDoc())
693                 bs.set(docs1.scoreDocs[0].doc)
694                 return DocIdBitSet(bs)
695
696         filt = filter()
697
698         docs2 = self.full.search(self.queryE, filt, nDocs, sort)
699         self.assertEqual(docs1.scoreDocs[0].score,
700                          docs2.scoreDocs[0].score,
701                          1e-6)
702   
703     def testSortWithoutFillFields(self):
704         """
705         There was previously a bug in TopFieldCollector when fillFields was
706         set to False - the same doc and score was set in ScoreDoc[]
707         array. This test asserts that if fillFields is False, the documents
708         are set properly. It does not use Searcher's default search
709         methods(with Sort) since all set fillFields to True.
710         """
711
712         sorts = [Sort(SortField.FIELD_DOC), Sort()]
713         for sort in sorts:
714             q = MatchAllDocsQuery()
715             tdc = TopFieldCollector.create(sort, 10, False,
716                                            False, False, True)
717             self.full.search(q, tdc)
718       
719             sds = tdc.topDocs().scoreDocs
720             for i in xrange(1, len(sds)):
721                 self.assert_(sds[i].doc != sds[i - 1].doc)
722
723     def testSortWithoutScoreTracking(self):
724         """
725         Two Sort criteria to instantiate the multi/single comparators.
726         """
727
728         sorts = [Sort(SortField.FIELD_DOC), Sort()]
729         for sort in sorts:
730             q = MatchAllDocsQuery()
731             tdc = TopFieldCollector.create(sort, 10, True, False,
732                                            False, True)
733       
734             self.full.search(q, tdc)
735       
736             tds = tdc.topDocs()
737             sds = tds.scoreDocs
738             for sd in sds:
739                 self.assert_(Float.isNaN_(sd.score))
740
741             self.assert_(Float.isNaN_(tds.getMaxScore()))
742
743     def testSortWithScoreNoMaxScoreTracking(self):
744         """
745         Two Sort criteria to instantiate the multi/single comparators.
746         """
747         
748         sorts = [Sort(SortField.FIELD_DOC), Sort()]
749         for sort in sorts:
750             q = MatchAllDocsQuery()
751             tdc = TopFieldCollector.create(sort, 10, True, True,
752                                            False, True)
753       
754             self.full.search(q, tdc)
755       
756             tds = tdc.topDocs()
757             sds = tds.scoreDocs
758             for sd in sds:
759                 self.assert_(not Float.isNaN_(sd.score))
760
761             self.assert_(Float.isNaN_(tds.getMaxScore()))
762   
763     def testSortWithScoreAndMaxScoreTracking(self):
764         """
765         Two Sort criteria to instantiate the multi/single comparators.
766         """
767         
768         sorts = [Sort(SortField.FIELD_DOC), Sort()]
769         for sort in sorts:
770             q = MatchAllDocsQuery()
771             tdc = TopFieldCollector.create(sort, 10, True, True,
772                                            True, True)
773       
774             self.full.search(q, tdc)
775       
776             tds = tdc.topDocs()
777             sds = tds.scoreDocs
778             for sd in sds:
779                 self.assert_(not Float.isNaN_(sd.score))
780
781             self.assert_(not Float.isNaN_(tds.getMaxScore()))
782
783     def testOutOfOrderDocsScoringSort(self):
784         """
785         Two Sort criteria to instantiate the multi/single comparators.
786         """
787
788         sorts = [Sort(SortField.FIELD_DOC), Sort()]
789
790         tfcOptions = [[False, False, False],
791                       [False, False, True],
792                       [False, True, False],
793                       [False, True, True],
794                       [True, False, False],
795                       [True, False, True],
796                       [True, True, False],
797                       [True, True, True]]
798
799         actualTFCClasses = [
800             "OutOfOrderOneComparatorNonScoringCollector", 
801             "OutOfOrderOneComparatorScoringMaxScoreCollector", 
802             "OutOfOrderOneComparatorScoringNoMaxScoreCollector", 
803             "OutOfOrderOneComparatorScoringMaxScoreCollector", 
804             "OutOfOrderOneComparatorNonScoringCollector", 
805             "OutOfOrderOneComparatorScoringMaxScoreCollector", 
806             "OutOfOrderOneComparatorScoringNoMaxScoreCollector", 
807             "OutOfOrderOneComparatorScoringMaxScoreCollector" 
808         ]
809     
810         bq = BooleanQuery()
811
812         # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
813         # which delegates to BS if there are no mandatory clauses.
814         bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
815
816         # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
817         # return the clause instead of BQ.
818         bq.setMinimumNumberShouldMatch(1)
819
820         for sort in sorts:
821             for tfcOption, actualTFCClass in izip(tfcOptions,
822                                                   actualTFCClasses):
823                 tdc = TopFieldCollector.create(sort, 10, tfcOption[0],
824                                                tfcOption[1], tfcOption[2],
825                                                False)
826
827                 self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass))
828           
829                 self.full.search(bq, tdc)
830           
831                 tds = tdc.topDocs()
832                 sds = tds.scoreDocs  
833                 self.assertEqual(10, len(sds))
834   
835     def testSortWithScoreAndMaxScoreTrackingNoResults(self):
836         """
837         Two Sort criteria to instantiate the multi/single comparators.
838         """
839
840         sorts = [Sort(SortField.FIELD_DOC), Sort()]
841         for sort in sorts:
842             tdc = TopFieldCollector.create(sort, 10, True, True, True, True)
843             tds = tdc.topDocs()
844             self.assertEqual(0, tds.totalHits)
845             self.assert_(Float.isNaN_(tds.getMaxScore()))
846   
847     def runMultiSorts(self, multi, isFull):
848         """
849         runs a variety of sorts useful for multisearchers
850         """
851         sort = Sort()
852
853         sort.setSort(SortField.FIELD_DOC)
854         expected = isFull and "ABCDEFGHIJ" or "ACEGIBDFHJ"
855         self._assertMatches(multi, self.queryA, sort, expected)
856
857         sort.setSort(SortField("int", SortField.INT))
858         expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC"
859         self._assertMatches(multi, self.queryA, sort, expected)
860
861         sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC])
862         expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC"
863         self._assertMatches(multi, self.queryA, sort, expected)
864
865         sort.setSort(SortField("int", SortField.INT))
866         expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC"
867         self._assertMatches(multi, self.queryA, sort, expected)
868
869         sort.setSort([SortField("float", SortField.FLOAT), SortField.FIELD_DOC])
870         self._assertMatches(multi, self.queryA, sort, "GDHJCIEFAB")
871
872         sort.setSort(SortField("float", SortField.FLOAT))
873         self._assertMatches(multi, self.queryA, sort, "GDHJCIEFAB")
874
875         sort.setSort(SortField("string", SortField.STRING))
876         self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
877
878         sort.setSort(SortField("int", SortField.INT, True))
879         expected = isFull and "CABEJGFHDI" or "CAEBJGFHDI"
880         self._assertMatches(multi, self.queryA, sort, expected)
881
882         sort.setSort(SortField("float", SortField.FLOAT, True))
883         self._assertMatches(multi, self.queryA, sort, "BAFECIJHDG")
884
885         sort.setSort(SortField("string", SortField.STRING, True))
886         self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD")
887
888         sort.setSort([SortField("int", SortField.INT),
889                       SortField("float", SortField.FLOAT)])
890         self._assertMatches(multi, self.queryA, sort, "IDHFGJEABC")
891
892         sort.setSort([SortField("float", SortField.FLOAT),
893                       SortField("string", SortField.STRING)])
894         self._assertMatches(multi, self.queryA, sort, "GDHJICEFAB")
895
896         sort.setSort(SortField("int", SortField.INT))
897         self._assertMatches(multi, self.queryF, sort, "IZJ")
898
899         sort.setSort(SortField("int", SortField.INT, True))
900         self._assertMatches(multi, self.queryF, sort, "JZI")
901
902         sort.setSort(SortField("float", SortField.FLOAT))
903         self._assertMatches(multi, self.queryF, sort, "ZJI")
904
905         sort.setSort(SortField("string", SortField.STRING))
906         self._assertMatches(multi, self.queryF, sort, "ZJI")
907
908         sort.setSort(SortField("string", SortField.STRING, True))
909         self._assertMatches(multi, self.queryF, sort, "IJZ")
910
911         # up to this point, all of the searches should have "sane" 
912         # FieldCache behavior, and should have reused hte cache in several
913         # cases 
914         self._assertSaneFieldCaches(self.getName() + " various")
915         
916         # next we'll check Locale based(String[]) for 'string', so purge first
917         FieldCache.DEFAULT.purgeAllCaches()
918
919         sort.setSort([SortField("string", Locale.US)])
920         self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
921
922         sort.setSort([SortField("string", Locale.US, True)])
923         self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD")
924
925         sort.setSort([SortField("string", Locale.UK)])
926         self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
927
928         self._assertSaneFieldCaches(self.getName() + " Locale.US + Locale.UK")
929         FieldCache.DEFAULT.purgeAllCaches()
930
931     def _assertMatches(self, searcher, query, sort, expectedResult):
932         """
933         make sure the documents returned by the search match the expected
934         list
935         """
936
937         # ScoreDoc[] result = searcher.search(query, None, 1000, sort).scoreDocs
938         hits = searcher.search(query, None, len(expectedResult) or 1, sort)
939         sds = hits.scoreDocs
940
941         self.assertEqual(hits.totalHits, len(expectedResult))
942         buff = []
943         for sd in sds:
944             doc = searcher.doc(sd.doc)
945             v = doc.getValues("tracer")
946             for _v in v:
947                 buff.append(_v)
948
949         self.assertEqual(expectedResult, ''.join(buff))
950
951     def getScores(self, hits, searcher):
952
953         scoreMap = {}
954         for hit in hits:
955             doc = searcher.doc(hit.doc)
956             v = doc.getValues("tracer")
957             self.assertEqual(len(v), 1)
958             scoreMap[v[0]] = hit.score
959
960         return scoreMap
961
962     def _assertSameValues(self, m1, m2):
963         """
964         make sure all the values in the maps match
965         """
966
967         self.assertEquals(len(m1), len(m2))
968         for key in m1.iterkeys():
969             self.assertEquals(m1[key], m2[key], 1e-6)
970
971     def getName(self):
972
973         return type(self).__name__
974
975     def _assertSaneFieldCaches(self, msg):
976
977         entries = FieldCache.DEFAULT.getCacheEntries()
978
979         insanity = FieldCacheSanityChecker.checkSanity(entries)
980         self.assertEqual(0, len(insanity),
981                          msg + ": Insane FieldCache usage(s) found")
982
983
class MyFieldComparator(PythonFieldComparator):
    """
    Field comparator ordering hits by an int parsed from the "parser" field.

    ``slotValues`` holds one int per slot, ``docValues`` holds the parsed
    ints of the current reader, and ``bottomValue`` holds the value of the
    slot last passed to setBottom().
    """

    def __init__(self, numHits):
        super(MyFieldComparator, self).__init__()
        # every slot starts at 0 until copy() stores a value in it
        self.slotValues = [0] * numHits

    def copy(self, slot, doc):
        """Store the value of ``doc`` in ``slot``."""
        docValue = self.docValues[doc]
        self.slotValues[slot] = docValue

    def compare(self, slot1, slot2):
        """Return the value in ``slot1`` minus the value in ``slot2``."""
        first = self.slotValues[slot1]
        second = self.slotValues[slot2]
        return first - second

    def compareBottom(self, doc):
        """Return the bottom value minus the value of ``doc``."""
        bottom = self.bottomValue
        return bottom - self.docValues[doc]

    def setBottom(self, bottom):
        """Remember the value held in slot ``bottom`` as the bottom value."""
        self.bottomValue = self.slotValues[bottom]

    def setNextReader(self, reader, docBase):
        """Load the parsed "parser" field values of ``reader``."""

        class LetterIntParser(PythonIntParser):
            def parseInt(_self, val):
                # distance of the first character from 'A', scaled
                distance = ord(val[0]) - ord('A')
                return distance * 123456

        parser = LetterIntParser()
        self.docValues = FieldCache.DEFAULT.getInts(reader, "parser", parser)

    def value(self, slot):
        """Return the value in ``slot`` wrapped in an Integer."""
        slotValue = self.slotValues[slot]
        return Integer(slotValue)
1013
1014
class MyFieldComparatorSource(PythonFieldComparatorSource):
    """
    Comparator source that hands out MyFieldComparator instances.
    """

    def newComparator(self, fieldname, numHits, sortPos, reversed):
        """
        Return a new MyFieldComparator sized for ``numHits`` slots.

        ``fieldname``, ``sortPos`` and ``reversed`` are not used.
        """
        comparator = MyFieldComparator(numHits)
        return comparator
1019
1020
1021
if __name__ == "__main__":
    import sys
    import lucene

    # The VM handle is kept in `env` because the commented-out reference
    # dump below reads it.
    env = lucene.initVM()
    if '-loop' not in sys.argv:
        main()
    else:
        # '-loop' is removed from argv before main() gets to parse it;
        # the whole suite is then re-run forever.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            # NOTE(review): the bare except also catches SystemExit, which
            # unittest's main() normally raises when it finishes --
            # presumably deliberate, so that the loop keeps going.
            except:
                pass
#            refs = sorted(env._dumpRefs(classes=True).items(),
#                          key=lambda x: x[1], reverse=True)
#            print refs[0:4]