samples/LuceneInAction/lia/common/TestDataDocumentHandler.py

   1 # ====================================================================
   2 #   Licensed under the Apache License, Version 2.0 (the "License");
   3 #   you may not use this file except in compliance with the License.
   4 #   You may obtain a copy of the License at
   5 #
   6 #       http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 #   Unless required by applicable law or agreed to in writing, software
   9 #   distributed under the License is distributed on an "AS IS" BASIS,
  10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 #   See the License for the specific language governing permissions and
  12 #   limitations under the License.
  13 # ====================================================================
  14
  15 import os
  16
  17 from lucene import \
  18     Document, Field, IndexWriter, StandardAnalyzer, NumericField, \
  19     SimpleDateFormat, Version, SimpleFSDirectory, File, DateTools, DateField
  20
  21 # Date written to every document's "modified" field; culled from the LuceneInAction.zip archive on the Manning site.
  22 samplesModified = SimpleDateFormat('yyyy-MM-dd').parse('2004-12-02')
  23
  24
  25 class TestDataDocumentHandler(object):
  26
  27     def createIndex(cls, dataDir, indexDir, useCompound):
  28
  29         indexDir = SimpleFSDirectory(File(indexDir))
  30         writer = IndexWriter(indexDir,
  31                              StandardAnalyzer(Version.LUCENE_CURRENT), True,
  32                              IndexWriter.MaxFieldLength.UNLIMITED)
  33         writer.setUseCompoundFile(useCompound)
  34
  35         for dir, dirnames, filenames in os.walk(dataDir):
  36             for filename in filenames:
  37                 if filename.endswith('.properties'):
  38                     cls.indexFile(writer, os.path.join(dir, filename), dataDir)
  39
  40         writer.optimize()
  41         writer.close()
  42
  43     def indexFile(cls, writer, path, baseDir):
  44
  45         input = file(path)
  46         props = {}
  47         while True:
  48             line = input.readline().strip()
  49             if not line:
  50                 break
  51             name, value = line.split('=', 1)
  52             props[name] = value.decode('unicode-escape')
  53         input.close()
  54
  55         doc = Document()
  56
  57         # category comes from relative path below the base directory
  58         category = os.path.dirname(path)[len(baseDir):]
  59         if os.path.sep != '/':
  60             category = category.replace(os.path.sep, '/')
  61
  62         isbn = props['isbn']
  63         title = props['title']
  64         author = props['author']
  65         url = props['url']
  66         subject = props['subject']
  67         pubmonth = props['pubmonth']
  68
  69         print title.encode('utf8')
  70         print author.encode('utf-8')
  71         print subject.encode('utf-8')
  72         print category.encode('utf-8')
  73         print "---------"
  74
  75         doc.add(Field("isbn", isbn,
  76                       Field.Store.YES, Field.Index.NOT_ANALYZED))
  77         doc.add(Field("category", category,
  78                       Field.Store.YES, Field.Index.NOT_ANALYZED))
  79         doc.add(Field("title", title,
  80                       Field.Store.YES, Field.Index.ANALYZED,
  81                       Field.TermVector.WITH_POSITIONS_OFFSETS))
  82         doc.add(Field("title2", title.lower(),
  83                       Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS,
  84                       Field.TermVector.WITH_POSITIONS_OFFSETS))
  85
  86         # split multiple authors into unique field instances
  87         authors = author.split(',')
  88         for a in authors:
  89             doc.add(Field("author", a,
  90                           Field.Store.YES, Field.Index.NOT_ANALYZED,
  91                           Field.TermVector.WITH_POSITIONS_OFFSETS))
  92
  93         doc.add(Field("url", url,
  94                       Field.Store.YES,
  95                       Field.Index.NOT_ANALYZED_NO_NORMS))
  96         doc.add(Field("subject", subject,
  97                       Field.Store.NO, Field.Index.ANALYZED,
  98                       Field.TermVector.WITH_POSITIONS_OFFSETS))
  99         doc.add(NumericField("pubmonth",
 100                              Field.Store.YES,
 101                              True).setIntValue(int(pubmonth)))
 102
 103         d = DateTools.stringToDate(pubmonth)
 104         d = int(d.getTime() / (1000 * 3600 * 24.0))
 105         doc.add(NumericField("pubmonthAsDay").setIntValue(d))
 106
 107         doc.add(Field("contents", ' '.join([title, subject, author, category]),
 108                       Field.Store.NO, Field.Index.ANALYZED,
 109                       Field.TermVector.WITH_POSITIONS_OFFSETS))
 110
 111         doc.add(Field("path", path,
 112                       Field.Store.YES, Field.Index.NOT_ANALYZED))
 113         doc.add(Field("modified", DateField.dateToString(samplesModified),
 114                       Field.Store.YES, Field.Index.NOT_ANALYZED))
 115
 116         writer.addDocument(doc)
 117
 118     createIndex = classmethod(createIndex)
 119     indexFile = classmethod(indexFile)