test/test_ThaiAnalyzer.py

   1 # -*- coding: utf-8 -*-
   2 # ====================================================================
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
   4 #   you may not use this file except in compliance with the License.
   5 #   You may obtain a copy of the License at
   6 #
   7 #       http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 #   Unless required by applicable law or agreed to in writing, software
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 #   See the License for the specific language governing permissions and
  13 #   limitations under the License.
  14 # ====================================================================
  15
  16 from unittest import TestCase, main
  17 from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
  18 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
  19
  20
  21 class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
  22
  23     def testOffsets(self):
  24         self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
  25                      "JRE does not support Thai dictionary-based BreakIterator")
  26
  27         self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
  28                                u"การที่ได้ต้องแสดงว่างานดี",
  29                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
  30                                  u"ว่า", u"งาน", u"ดี" ],
  31                                [ 0, 3, 6, 9, 13, 17, 20, 23 ],
  32                                [ 3, 6, 9, 13, 17, 20, 23, 25 ])
  33
  34     def testTokenType(self):
  35         self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
  36                      "JRE does not support Thai dictionary-based BreakIterator")
  37
  38         self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
  39                                u"การที่ได้ต้องแสดงว่างานดี ๑๒๓",
  40                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
  41                                  u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
  42                                None, None,
  43                                [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  44                                  "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  45                                  "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  46                                  "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
  47                                  "<NUM>" ])
  48
  49     def testPositionIncrements(self):
  50         self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
  51                      "JRE does not support Thai dictionary-based BreakIterator")
  52
  53         analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)
  54
  55         self._assertAnalyzesTo(analyzer, u"การที่ได้ต้อง the แสดงว่างานดี",
  56                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
  57                                  u"ว่า", u"งาน", u"ดี" ],
  58                                [ 0, 3, 6, 9, 18, 22, 25, 28 ],
  59                                [ 3, 6, 9, 13, 22, 25, 28, 30 ],
  60                                None,
  61                                [ 1, 1, 1, 1, 2, 1, 1, 1 ])
  62
  63         # case that a stopword is adjacent to thai text, with no whitespace
  64         self._assertAnalyzesTo(analyzer, u"การที่ได้ต้องthe แสดงว่างานดี",
  65                                [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
  66                                  u"ว่า", u"งาน", u"ดี" ],
  67                                [ 0, 3, 6, 9, 17, 21, 24, 27 ],
  68                                [ 3, 6, 9, 13, 21, 24, 27, 29 ],
  69                                None,
  70                                [ 1, 1, 1, 1, 2, 1, 1, 1 ])
  71
  72     def testAnalyzer30(self):
  73
  74         analyzer = ThaiAnalyzer(Version.LUCENE_30)
  75
  76         self._assertAnalyzesTo(analyzer, u"", [])
  77
  78         self._assertAnalyzesTo(analyzer,
  79                                u"การที่ได้ต้องแสดงว่างานดี",
  80                                [ u"การ", u"ที่", u"ได้", u"ต้อง",
  81                                  u"แสดง", u"ว่า", u"งาน", u"ดี" ])
  82
  83         self._assertAnalyzesTo(analyzer,
  84                                u"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
  85                                [ u"บริษัท", u"ชื่อ", u"xy&z", u"คุย", u"กับ", u"xyz@demo.com" ])
  86
  87         # English stop words
  88         self._assertAnalyzesTo(analyzer,
  89                                u"ประโยคว่า The quick brown fox jumped over the lazy dogs",
  90                                [ u"ประโยค", u"ว่า", u"quick", u"brown", u"fox",
  91                                  u"jumped", u"over", u"lazy", u"dogs" ])
  92
  93
  94 if __name__ == "__main__":
  95     import sys, lucene
  96     lucene.initVM()
  97     if ThaiWordFilter.DBBI_AVAILABLE:
  98         if '-loop' in sys.argv:
  99             sys.argv.remove('-loop')
 100             while True:
 101                 try:
 102                     main()
 103                 except:
 104                     pass
 105         else:
 106             main()
 107     else:
 108         print >>sys.stderr, "Thai not supported by this JVM, tests skipped"