test/BaseTokenStreamTestCase.py

   1 # ====================================================================
   2 #   Licensed under the Apache License, Version 2.0 (the "License");
   3 #   you may not use this file except in compliance with the License.
   4 #   You may obtain a copy of the License at
   5 #
   6 #       http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 #   Unless required by applicable law or agreed to in writing, software
   9 #   distributed under the License is distributed on an "AS IS" BASIS,
  10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 #   See the License for the specific language governing permissions and
  12 #   limitations under the License.
  13 # ====================================================================
  14
  15 from unittest import TestCase, main
  16 from lucene import *
  17
  18 class BaseTokenStreamTestCase(TestCase):
  19     """
  20     Base class for all Lucene unit tests that use TokenStreams.
  21     """
  22
  23     def _assertTokenStreamContents(self, ts, output,
  24                                    startOffsets=None, endOffsets=None,
  25                                    types=None, posIncrements=None):
  26
  27         self.assert_(output is not None)
  28         self.assert_(ts.hasAttribute(TermAttribute.class_),
  29                                      "has TermAttribute")
  30
  31         termAtt = ts.getAttribute(TermAttribute.class_)
  32
  33         offsetAtt = None
  34         if startOffsets is not None or endOffsets is not None:
  35             self.assert_(ts.hasAttribute(OffsetAttribute.class_),
  36                                          "has OffsetAttribute")
  37             offsetAtt = ts.getAttribute(OffsetAttribute.class_)
  38
  39         typeAtt = None
  40         if types is not None:
  41             self.assert_(ts.hasAttribute(TypeAttribute.class_),
  42                          "has TypeAttribute")
  43             typeAtt = ts.getAttribute(TypeAttribute.class_)
  44
  45         posIncrAtt = None
  46         if posIncrements is not None:
  47             self.assert_(ts.hasAttribute(PositionIncrementAttribute.class_),
  48                          "has PositionIncrementAttribute")
  49             posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class_)
  50
  51         ts.reset()
  52         for i in xrange(len(output)):
  53             # extra safety to enforce, that the state is not preserved and
  54             # also assign bogus values
  55             ts.clearAttributes()
  56             termAtt.setTermBuffer("bogusTerm")
  57             if offsetAtt is not None:
  58                 offsetAtt.setOffset(14584724, 24683243)
  59             if typeAtt is not None:
  60                 typeAtt.setType("bogusType")
  61             if posIncrAtt is not None:
  62                 posIncrAtt.setPositionIncrement(45987657)
  63
  64             self.assert_(ts.incrementToken(), "token %d exists" %(i))
  65             self.assertEqual(output[i], termAtt.term(), "term %d" %(i))
  66             if startOffsets is not None:
  67                 self.assertEqual(startOffsets[i], offsetAtt.startOffset(),
  68                                  "startOffset %d" %(i))
  69             if endOffsets is not None:
  70                 self.assertEqual(endOffsets[i], offsetAtt.endOffset(),
  71                                  "endOffset %d" %(i))
  72             if types is not None:
  73                 self.assertEqual(types[i], typeAtt.type(), "type %d" %(i))
  74             if posIncrements is not None:
  75                 self.assertEqual(posIncrements[i],
  76                                  posIncrAtt.getPositionIncrement(),
  77                                  "posIncrement %d" %(i))
  78
  79         self.assert_(not ts.incrementToken(), "end of stream")
  80         ts.end()
  81         ts.close()
  82
  83     def _assertAnalyzesTo(self, a, input, output,
  84                           startOffsets=None, endOffsets=None,
  85                           types=None, posIncrements=None):
  86
  87         ts = a.tokenStream("dummy", StringReader(input))
  88         self._assertTokenStreamContents(ts, output, startOffsets, endOffsets,
  89                                         types, posIncrements)
  90
  91     def _assertAnalyzesToReuse(self, a, input, output,
  92                                startOffsets=None, endOffsets=None,
  93                                types=None, posIncrements=None):
  94
  95         ts = a.reusableTokenStream("dummy", StringReader(input))
  96         self._assertTokenStreamContents(ts, output, startOffsets, endOffsets,
  97                                         types, posIncrements)
  98
  99     # simple utility method for testing stemmers
 100     def _checkOneTerm(self, a, input, expected):
 101         self._assertAnalyzesTo(a, input, JArray('string')(expected))
 102
 103     def _checkOneTermReuse(self, a, input, expected):
 104         self._assertAnalyzesToReuse(a, input, JArray('string')(expected))