1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 from unittest import TestCase, main
17 from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
18 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
21 class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
def testOffsets(self):
    """Check the token offsets ThaiAnalyzer reports for unbroken Thai text.

    First asserts that ThaiWordFilter.DBBI_AVAILABLE is true, then hands
    the analyzer, the input string, the expected tokens and two parallel
    integer lists to self._assertAnalyzesTo.  The two lists line up with
    where each expected token begins and ends in the input string.
    """
    # assertTrue is the supported spelling; assert_ is a deprecated
    # unittest alias.
    self.assertTrue(ThaiWordFilter.DBBI_AVAILABLE,
                    "JRE does not support Thai dictionary-based BreakIterator")

    analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)
    text = u"การที่ได้ต้องแสดงว่างานดี"
    expected_tokens = [u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
                       u"ว่า", u"งาน", u"ดี"]
    # Index of the first character of each token / one past its last.
    start_offsets = [0, 3, 6, 9, 13, 17, 20, 23]
    end_offsets = [3, 6, 9, 13, 17, 20, 23, 25]

    self._assertAnalyzesTo(analyzer, text, expected_tokens,
                           start_offsets, end_offsets)
34 def testTokenType(self):
35 self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
36 "JRE does not support Thai dictionary-based BreakIterator")
38 self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
39 u"การที่ได้ต้องแสดงว่างานดี ๑๒๓",
40 [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
41 u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
43 [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
44 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
45 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
46 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
def testPositionIncrements(self):
    """Check ThaiAnalyzer output when the English word "the" sits in Thai text.

    First asserts that ThaiWordFilter.DBBI_AVAILABLE is true.  Two inputs
    are then analyzed with one ThaiAnalyzer(Version.LUCENE_CURRENT): one
    where "the" is separated from the Thai text by spaces, and one where
    it directly follows the Thai text with no whitespace.  In both cases
    the expected tokens do not contain "the", and the expected increment
    for the token after it is 2 instead of 1.
    """
    # assertTrue is the supported spelling; assert_ is a deprecated
    # unittest alias.
    self.assertTrue(ThaiWordFilter.DBBI_AVAILABLE,
                    "JRE does not support Thai dictionary-based BreakIterator")

    analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)

    expected_tokens = [u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
                       u"ว่า", u"งาน", u"ดี"]
    expected_increments = [1, 1, 1, 1, 2, 1, 1, 1]

    cases = [
        # "the" surrounded by whitespace
        (u"การที่ได้ต้อง the แสดงว่างานดี",
         [0, 3, 6, 9, 18, 22, 25, 28],
         [3, 6, 9, 13, 22, 25, 28, 30]),
        # case where a stopword is adjacent to Thai text, with no whitespace
        (u"การที่ได้ต้องthe แสดงว่างานดี",
         [0, 3, 6, 9, 17, 21, 24, 27],
         [3, 6, 9, 13, 21, 24, 27, 29]),
    ]

    for text, start_offsets, end_offsets in cases:
        # NOTE(review): the increments list is passed as the fifth
        # positional argument, directly after the two offset lists --
        # confirm that this slot of _assertAnalyzesTo is the
        # position-increment parameter.
        # Fresh list copies per call, matching the separate literals the
        # calls were originally given.
        self._assertAnalyzesTo(analyzer, text, list(expected_tokens),
                               start_offsets, end_offsets,
                               list(expected_increments))
def testAnalyzer30(self):
    """Run ThaiAnalyzer(Version.LUCENE_30) over a handful of sample inputs.

    Each input is paired with the token texts expected from the analyzer;
    only those two values are passed to self._assertAnalyzesTo, so no
    offsets or other per-token attributes are supplied here.
    """
    analyzer = ThaiAnalyzer(Version.LUCENE_30)

    expectations = [
        # empty input yields no tokens
        (u"", []),
        # unbroken Thai text
        (u"การที่ได้ต้องแสดงว่างานดี",
         [u"การ", u"ที่", u"ได้", u"ต้อง",
          u"แสดง", u"ว่า", u"งาน", u"ดี"]),
        # Thai mixed with a company-style name and an e-mail address;
        # the expected Latin tokens are lower-case
        (u"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
         [u"บริษัท", u"ชื่อ", u"xy&z", u"คุย", u"กับ", u"xyz@demo.com"]),
        # Thai followed by an English sentence; "The"/"the" do not
        # appear among the expected tokens
        (u"ประโยคว่า The quick brown fox jumped over the lazy dogs",
         [u"ประโยค", u"ว่า", u"quick", u"brown", u"fox",
          u"jumped", u"over", u"lazy", u"dogs"]),
    ]

    # Checked in order; the first mismatch aborts the test.
    for text, tokens in expectations:
        self._assertAnalyzesTo(analyzer, text, tokens)
94 if __name__ == "__main__":
97 if ThaiWordFilter.DBBI_AVAILABLE:
98 if '-loop' in sys.argv:
99 sys.argv.remove('-loop')
108 print >>sys.stderr, "Thai not supported by this JVM, tests skipped"