PyLucene 3.4.0-1 import
pylucene.git: samples/LuceneInAction/lia/analysis/AnalyzerUtils.py
# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

from lucene import \
     SimpleAnalyzer, StandardAnalyzer, StringReader, Version, \
     TermAttribute, PositionIncrementAttribute, TypeAttribute, OffsetAttribute


class AnalyzerUtils(object):

    def main(cls, argv):

        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(StandardAnalyzer(Version.LUCENE_CURRENT),
                                         "I'll e-mail you at xyz@example.com")

    def setPositionIncrement(cls, source, posIncr):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    def getPositionIncrement(cls, source):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    def setTerm(cls, source, term):
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    def getTerm(cls, source):
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    def setType(cls, source, type):
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    def getType(cls, source):
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()
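
    # ----------------------------------------------------------------
    # Note (added): the set*/get* helpers above simply wrap
    # addAttribute() on a TokenStream (or any other AttributeSource).
    # An illustrative use, e.g. a filter injecting a synonym at the
    # same position as the original token (position increment 0),
    # might look like this sketch:
    #
    #     AnalyzerUtils.setTerm(stream, "jumps")
    #     AnalyzerUtils.setPositionIncrement(stream, 0)
    # ----------------------------------------------------------------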

    def displayTokens(cls, analyzer, text):

        tokenStream = analyzer.tokenStream("contents", StringReader(text))
        term = tokenStream.addAttribute(TermAttribute.class_)

        while tokenStream.incrementToken():
            print "[%s]" %(term.term()),

    def displayTokensWithPositions(cls, analyzer, text):

        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" %(position),

            print "[%s]" %(term.term()),
        print

    def displayTokensWithFullDetails(cls, analyzer, text):

        stream = analyzer.tokenStream("contents", StringReader(text))

        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        type = stream.addAttribute(TypeAttribute.class_)

        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" %(position),

            print "[%s:%d->%d:%s]" %(term.term(),
                                     offset.startOffset(),
                                     offset.endOffset(),
                                     type.type()),
        print
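
    # ----------------------------------------------------------------
    # Note (added): for the SimpleAnalyzer example invoked from main()
    # above, displayTokensWithFullDetails would be expected to print
    # something like the following under Lucene 3.x defaults (the
    # "word" type and the offsets come from LetterTokenizer):
    #
    #     1: [the:0->3:word]
    #     2: [quick:4->9:word]
    #     3: [brown:10->15:word]
    #     4: [fox:16->19:word]
    # ----------------------------------------------------------------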

    def assertAnalyzesTo(cls, analyzer, input, outputs):

        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)
        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError, 'stream.incrementToken()'
            if output != termAttr.term():
                raise AssertionError, 'output == termAttr.term()'

        if stream.incrementToken():
            raise AssertionError, 'not stream.incrementToken()'

        stream.close()

    main = classmethod(main)
    setPositionIncrement = classmethod(setPositionIncrement)
    getPositionIncrement = classmethod(getPositionIncrement)
    setTerm = classmethod(setTerm)
    getTerm = classmethod(getTerm)
    setType = classmethod(setType)
    getType = classmethod(getType)
    displayTokens = classmethod(displayTokens)
    displayTokensWithPositions = classmethod(displayTokensWithPositions)
    displayTokensWithFullDetails = classmethod(displayTokensWithFullDetails)
    assertAnalyzesTo = classmethod(assertAnalyzesTo)


if __name__ == "__main__":
    import sys, lucene
    # the PyLucene JVM must be started before any Lucene class is used
    lucene.initVM()
    AnalyzerUtils.main(sys.argv)
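

# --------------------------------------------------------------------
# Usage sketch (added, not part of the original sample): these helpers
# are meant to be imported by the other Lucene in Action examples.  A
# minimal caller might look like the lines below; they assume the JVM
# has been started with initVM() and that the LuceneInAction samples
# directory is on the Python path, and they use an illustrative
# analyzer and text.
#
#     from lucene import initVM, SimpleAnalyzer
#     from lia.analysis.AnalyzerUtils import AnalyzerUtils
#
#     initVM()
#     AnalyzerUtils.displayTokens(SimpleAnalyzer(),
#                                 "The quick brown fox....")
#     AnalyzerUtils.assertAnalyzesTo(SimpleAnalyzer(), "The quick brown fox",
#                                    ["the", "quick", "brown", "fox"])
# --------------------------------------------------------------------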