1 # -*- coding: utf-8 -*-
 
   2 # ====================================================================
 
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
 
   4 #   you may not use this file except in compliance with the License.
 
   5 #   You may obtain a copy of the License at
 
   7 #       http://www.apache.org/licenses/LICENSE-2.0
 
   9 #   Unless required by applicable law or agreed to in writing, software
 
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
 
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
  12 #   See the License for the specific language governing permissions and
 
  13 #   limitations under the License.
 
  14 # ====================================================================
 
  16 from unittest import TestCase, main
 
  17 from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
 
  18 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
 
  21 class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
 
  23     def testOffsets(self):
 
  24         self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
 
  25                      "JRE does not support Thai dictionary-based BreakIterator")
 
  27         self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
 
  28                                u"การที่ได้ต้องแสดงว่างานดี", 
 
  29                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
 
  30                                  u"ว่า", u"งาน", u"ดี" ],
 
  31                                [ 0, 3, 6, 9, 13, 17, 20, 23 ],
 
  32                                [ 3, 6, 9, 13, 17, 20, 23, 25 ])
 
  34     def testTokenType(self):
 
  35         self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
 
  36                      "JRE does not support Thai dictionary-based BreakIterator")
 
  38         self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
 
  39                                u"การที่ได้ต้องแสดงว่างานดี ๑๒๓", 
 
  40                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
 
  41                                  u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
 
  43                                [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>", 
 
  44                                  "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>", 
 
  45                                  "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
 
  46                                  "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
 
  49     def testPositionIncrements(self):
 
  50         self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
 
  51                      "JRE does not support Thai dictionary-based BreakIterator")
 
  53         analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)
 
  55         self._assertAnalyzesTo(analyzer, u"การที่ได้ต้อง the แสดงว่างานดี", 
 
  56                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
 
  57                                  u"ว่า", u"งาน", u"ดี" ],
 
  58                                [ 0, 3, 6, 9, 18, 22, 25, 28 ],
 
  59                                [ 3, 6, 9, 13, 22, 25, 28, 30 ],
 
  61                                [ 1, 1, 1, 1, 2, 1, 1, 1 ])
 
  63         # case that a stopword is adjacent to thai text, with no whitespace
 
  64         self._assertAnalyzesTo(analyzer, u"การที่ได้ต้องthe แสดงว่างานดี", 
 
  65                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
 
  66                                  u"ว่า", u"งาน", u"ดี" ],
 
  67                                [ 0, 3, 6, 9, 17, 21, 24, 27 ],
 
  68                                [ 3, 6, 9, 13, 21, 24, 27, 29 ],
 
  70                                [ 1, 1, 1, 1, 2, 1, 1, 1 ])
 
  72     def testAnalyzer30(self):
 
  74         analyzer = ThaiAnalyzer(Version.LUCENE_30)
 
  76         self._assertAnalyzesTo(analyzer, u"", [])
 
  78         self._assertAnalyzesTo(analyzer,
 
  79                                u"การที่ได้ต้องแสดงว่างานดี",
 
  80                                [ u"การ", u"ที่", u"ได้", u"ต้อง",
 
  81                                  u"แสดง", u"ว่า", u"งาน", u"ดี" ])
 
  83         self._assertAnalyzesTo(analyzer,
 
  84                                u"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
 
  85                                [ u"บริษัท", u"ชื่อ", u"xy&z", u"คุย", u"กับ", u"xyz@demo.com" ])
 
  88         self._assertAnalyzesTo(analyzer,
 
  89                                u"ประโยคว่า The quick brown fox jumped over the lazy dogs",
 
  90                                [ u"ประโยค", u"ว่า", u"quick", u"brown", u"fox",
 
  91                                  u"jumped", u"over", u"lazy", u"dogs" ])
 
  94 if __name__ == "__main__":
 
  97     if ThaiWordFilter.DBBI_AVAILABLE:
 
  98         if '-loop' in sys.argv:
 
  99             sys.argv.remove('-loop')
 
 108         print >>sys.stderr, "Thai not supported by this JVM, tests skipped"