samples/TermPositionVector.py

   1 from lucene import \
   2     StandardAnalyzer, RAMDirectory, Document, Field, Version, \
   3     IndexWriter, IndexReader, TermPositionVector, initVM
   4
   5 if __name__ == '__main__':
   6     initVM()
   7
   8 directory = RAMDirectory()
   9 iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
  10                       True, IndexWriter.MaxFieldLength.LIMITED)
  11 ts = ["this bernhard is the text to be index text",
  12       "this claudia is the text to be index"]
  13 for t in ts:
  14     doc = Document()
  15     doc.add(Field("fieldname", t,
  16                   Field.Store.YES, Field.Index.ANALYZED,
  17                   Field.TermVector.WITH_POSITIONS_OFFSETS))
  18     iwriter.addDocument(doc)
  19 iwriter.optimize()
  20 iwriter.close()
  21
  22 ireader = IndexReader.open(directory, True)
  23 tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
  24
  25 for (t,f,i) in zip(tpv.getTerms(),tpv.getTermFrequencies(),xrange(100000)):
  26     print 'term %s' % t
  27     print '  freq: %i' % f
  28     try:
  29         print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
  30     except:
  31         print '  no pos'
  32     try:
  33         print '  off: ' + \
  34               str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
  35                    for o in tpv.getOffsets(i)])
  36     except:
  37         print '  no offsets'