Configured build for Ubuntu and added Stempel polish analyzer
[pylucene.git] / samples / LuceneInAction / lia / common / TestDataDocumentHandler.py
# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

import os

from lucene import \
    Document, Field, IndexWriter, StandardAnalyzer, NumericField, \
    SimpleDateFormat, Version, SimpleFSDirectory, File, DateTools, DateField

# date culled from the LuceneInAction.zip archive on the Manning site
samplesModified = SimpleDateFormat('yyyy-MM-dd').parse('2004-12-02')


class TestDataDocumentHandler(object):

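    # walk dataDir for .properties files and index each one into indexDir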
    def createIndex(cls, dataDir, indexDir, useCompound):

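        # create a new index in indexDir; UNLIMITED disables the per-field
        # token cap, and useCompound toggles the compound file format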
        indexDir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(indexDir,
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(useCompound)

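        # each book is described by a .properties file somewhere under dataDir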
        for dir, dirnames, filenames in os.walk(dataDir):
            for filename in filenames:
                if filename.endswith('.properties'):
                    cls.indexFile(writer, os.path.join(dir, filename), dataDir)

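        # merge all segments into one, then flush and close the index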
        writer.optimize()
        writer.close()

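    # parse a single .properties file into a Document and add it to the writer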
    def indexFile(cls, writer, path, baseDir):

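        # the file holds simple name=value lines; values may contain
        # java-style \uXXXX escapes, hence the unicode-escape decode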
        input = open(path)
        props = {}
        while True:
            line = input.readline().strip()
            if not line:
                break
            name, value = line.split('=', 1)
            props[name] = value.decode('unicode-escape')
        input.close()

        doc = Document()

        # category comes from the relative path below the base directory
        category = os.path.dirname(path)[len(baseDir):]
        if os.path.sep != '/':
            category = category.replace(os.path.sep, '/')

        isbn = props['isbn']
        title = props['title']
        author = props['author']
        url = props['url']
        subject = props['subject']
        pubmonth = props['pubmonth']

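        # progress output while building the sample index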
        print title.encode('utf-8')
        print author.encode('utf-8')
        print subject.encode('utf-8')
        print category.encode('utf-8')
        print "---------"

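        # isbn and category are stored, exact-match keys; title is analyzed
        # for search, while title2 keeps the lowercased title as a single
        # un-analyzed term without norms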
        doc.add(Field("isbn", isbn,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("category", category,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", title,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field("title2", title.lower(),
                      Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

        # split multiple authors into unique field instances
        authors = author.split(',')
        for a in authors:
            doc.add(Field("author", a,
                          Field.Store.YES, Field.Index.NOT_ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

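        # url is stored verbatim; subject is analyzed for search but not stored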
        doc.add(Field("url", url,
                      Field.Store.YES,
                      Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(Field("subject", subject,
                      Field.Store.NO, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
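        # pubmonth is a yyyyMM string; index it as a NumericField so
        # numeric range queries work against it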
        doc.add(NumericField("pubmonth",
                             Field.Store.YES,
                             True).setIntValue(int(pubmonth)))

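        # also index the publication date as a day count since epoch;
        # getTime() returns milliseconds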
        d = DateTools.stringToDate(pubmonth)
        d = int(d.getTime() / (1000 * 3600 * 24.0))
        doc.add(NumericField("pubmonthAsDay").setIntValue(d))

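        # catch-all field so one query can search title, subject, author
        # and category together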
        doc.add(Field("contents", ' '.join([title, subject, author, category]),
                      Field.Store.NO, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

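        # path points back at the source .properties file; modified uses the
        # fixed samplesModified date defined above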
        doc.add(Field("path", path,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("modified", DateField.dateToString(samplesModified),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))

        writer.addDocument(doc)

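    # bind both methods as classmethods (pre-decorator style)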
    createIndex = classmethod(createIndex)
    indexFile = classmethod(indexFile)