test/test_StopAnalyzer.py

   1 # ====================================================================
   2 #   Licensed under the Apache License, Version 2.0 (the "License");
   3 #   you may not use this file except in compliance with the License.
   4 #   You may obtain a copy of the License at
   5 #
   6 #       http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 #   Unless required by applicable law or agreed to in writing, software
   9 #   distributed under the License is distributed on an "AS IS" BASIS,
  10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 #   See the License for the specific language governing permissions and
  12 #   limitations under the License.
  13 # ====================================================================
  14
  15 from unittest import TestCase, main
  16 from lucene import *
  17
  18
  19 class StopAnalyzerTestCase(TestCase):
  20     """
  21     Unit tests ported from Java Lucene
  22     """
  23
  24     def setUp(self):
  25
  26         self.stop = StopAnalyzer(Version.LUCENE_CURRENT)
  27         self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET
  28
  29     def testDefaults(self):
  30
  31         self.assert_(self.stop is not None)
  32         reader = StringReader("This is a test of the english stop analyzer")
  33         stream = self.stop.tokenStream("test", reader)
  34         self.assert_(stream is not None)
  35
  36         termAtt = stream.getAttribute(TermAttribute.class_)
  37
  38         while stream.incrementToken():
  39             self.assert_(termAtt.term() not in self.invalidTokens)
  40
  41     def testStopList(self):
  42
  43         stopWords = ["good", "test", "analyzer"]
  44         stopWordsSet = HashSet()
  45         for stopWord in stopWords:
  46             stopWordsSet.add(stopWord)
  47
  48         newStop = StopAnalyzer(Version.LUCENE_24, stopWordsSet)
  49         reader = StringReader("This is a good test of the english stop analyzer")
  50         stream = newStop.tokenStream("test", reader)
  51         self.assert_(stream is not None)
  52
  53         termAtt = stream.getAttribute(TermAttribute.class_)
  54         posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)
  55
  56         while stream.incrementToken():
  57             text = termAtt.term()
  58             self.assert_(text not in stopWordsSet)
  59             # by default stop tokenizer does not apply increments.
  60             self.assertEqual(1, posIncrAtt.getPositionIncrement())
  61
  62     def testStopListPositions(self):
  63
  64         stopWords = ["good", "test", "analyzer"]
  65         stopWordsSet = HashSet()
  66         for stopWord in stopWords:
  67             stopWordsSet.add(stopWord)
  68
  69         newStop = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
  70         reader = StringReader("This is a good test of the english stop analyzer with positions")
  71         expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
  72         stream = newStop.tokenStream("test", reader)
  73         self.assert_(stream is not None)
  74
  75         i = 0
  76         termAtt = stream.getAttribute(TermAttribute.class_)
  77         posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)
  78
  79         while stream.incrementToken():
  80             text = termAtt.term()
  81             self.assert_(text not in stopWordsSet)
  82             self.assertEqual(expectedIncr[i],
  83                              posIncrAtt.getPositionIncrement())
  84             i += 1
  85
  86
  87 if __name__ == "__main__":
  88     import sys, lucene
  89     lucene.initVM()
  90     if '-loop' in sys.argv:
  91         sys.argv.remove('-loop')
  92         while True:
  93             try:
  94                 main()
  95             except:
  96                 pass
  97     else:
  98          main()