pylucene 3.5.0-3
[pylucene.git] / test / test_ThaiAnalyzer.py
1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 #   Licensed under the Apache License, Version 2.0 (the "License");
4 #   you may not use this file except in compliance with the License.
5 #   You may obtain a copy of the License at
6 #
7 #       http://www.apache.org/licenses/LICENSE-2.0
8 #
9 #   Unless required by applicable law or agreed to in writing, software
10 #   distributed under the License is distributed on an "AS IS" BASIS,
11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 #   See the License for the specific language governing permissions and
13 #   limitations under the License.
14 # ====================================================================
15
16 from unittest import TestCase, main
17 from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
18 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
19
20
class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
    """Exercises ThaiAnalyzer through the _assertAnalyzesTo helper.

    Each test feeds a Thai (or mixed Thai/English) string to the analyzer
    and compares the produced tokens against the expected lists.  The
    positional arguments after the expected terms are assumed to be, in
    order: start offsets, end offsets, token types and position
    increments (inferred from the test names; confirm against
    BaseTokenStreamTestCase).
    """

    def _assertThaiSupported(self):
        # Shared precondition for the LUCENE_CURRENT tests: fail with an
        # explicit message when ThaiWordFilter reports that the
        # dictionary-based BreakIterator is unavailable.
        self.assertTrue(ThaiWordFilter.DBBI_AVAILABLE,
                        "JRE does not support Thai dictionary-based BreakIterator")

    def testOffsets(self):
        # Unspaced Thai text: check the terms and the two offset lists.
        self._assertThaiSupported()

        self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
                               u"การที่ได้ต้องแสดงว่างานดี",
                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
                                 u"ว่า", u"งาน", u"ดี" ],
                               [ 0, 3, 6, 9, 13, 17, 20, 23 ],
                               [ 3, 6, 9, 13, 17, 20, 23, 25 ])

    def testTokenType(self):
        # Thai words are expected to be typed <SOUTHEAST_ASIAN> and the
        # trailing Thai digits <NUM>; offsets are not checked (None, None).
        self._assertThaiSupported()

        self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
                               u"การที่ได้ต้องแสดงว่างานดี ๑๒๓",
                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
                                 u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
                               None, None,
                               [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
                                 "<NUM>" ])

    def testPositionIncrements(self):
        # The English word "the" is absent from the expected terms; the
        # token that follows it is expected to carry an increment of 2.
        self._assertThaiSupported()

        analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)

        self._assertAnalyzesTo(analyzer, u"การที่ได้ต้อง the แสดงว่างานดี",
                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
                                 u"ว่า", u"งาน", u"ดี" ],
                               [ 0, 3, 6, 9, 18, 22, 25, 28 ],
                               [ 3, 6, 9, 13, 22, 25, 28, 30 ],
                               None,
                               [ 1, 1, 1, 1, 2, 1, 1, 1 ])

        # case that a stopword is adjacent to thai text, with no whitespace
        self._assertAnalyzesTo(analyzer, u"การที่ได้ต้องthe แสดงว่างานดี",
                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
                                 u"ว่า", u"งาน", u"ดี" ],
                               [ 0, 3, 6, 9, 17, 21, 24, 27 ],
                               [ 3, 6, 9, 13, 21, 24, 27, 29 ],
                               None,
                               [ 1, 1, 1, 1, 2, 1, 1, 1 ])

    def testAnalyzer30(self):
        # Same kind of checks against an analyzer built for
        # Version.LUCENE_30; only the terms are compared here.
        analyzer = ThaiAnalyzer(Version.LUCENE_30)

        # Empty input is expected to yield no tokens.
        self._assertAnalyzesTo(analyzer, u"", [])

        self._assertAnalyzesTo(analyzer,
                               u"การที่ได้ต้องแสดงว่างานดี",
                               [ u"การ", u"ที่", u"ได้", u"ต้อง",
                                 u"แสดง", u"ว่า", u"งาน", u"ดี" ])

        # Mixed Thai/Latin input: the expected Latin terms are lower-cased
        # and "xy&z" / "xyz@demo.com" are each kept as a single term.
        self._assertAnalyzesTo(analyzer,
                               u"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
                               [ u"บริษัท", u"ชื่อ", u"xy&z", u"คุย", u"กับ", u"xyz@demo.com" ])

        # English stop words
        self._assertAnalyzesTo(analyzer,
                               u"ประโยคว่า The quick brown fox jumped over the lazy dogs",
                               [ u"ประโยค", u"ว่า", u"quick", u"brown", u"fox",
                                 u"jumped", u"over", u"lazy", u"dogs" ])
92
93
if __name__ == "__main__":
    import sys
    import lucene

    # Initialise the PyLucene VM before reading any Lucene class attribute.
    lucene.initVM()
    if ThaiWordFilter.DBBI_AVAILABLE:
        if '-loop' in sys.argv:
            # '-loop' re-runs the suite indefinitely (presumably for
            # leak/stress checking).  Drop the flag so that unittest's
            # main() does not try to interpret it.
            sys.argv.remove('-loop')
            while True:
                try:
                    main()
                except (SystemExit, Exception):
                    # unittest's main() finishes by raising SystemExit;
                    # swallow that and ordinary errors so the loop keeps
                    # going.  KeyboardInterrupt is deliberately NOT caught,
                    # so Ctrl-C can still stop the loop.
                    pass
        else:
            main()
    else:
        print >>sys.stderr, "Thai not supported by this JVM, tests skipped"