Index has proper tokens.
[wolnelektury.git] / apps / search / index.py
1
2 from django.conf import settings
3 from lucene import SimpleFSDirectory, IndexWriter, File, Field, NumericField, \
4     Version, Document, JavaError, IndexSearcher, QueryParser, Term, PerFieldAnalyzerWrapper, \
5     SimpleAnalyzer, PolishAnalyzer, ArrayList, KeywordAnalyzer, NumericRangeQuery
6     # KeywordAnalyzer
7 import os
8 import errno
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from catalogue.models import Book
12
13
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """
    Per-field analyzer used when writing the index.

    The Polish analyzer is handed to the wrapper as its default; the fields
    listed in ``__init__`` are registered with a simple or keyword analyzer
    instead.
    """

    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # Not sure the keyword analyzer is needed: Field.Index.NOT_ANALYZED
        # means basically the same thing.
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        PerFieldAnalyzerWrapper.__init__(self, polish)

        # (field name, analyzer) pairs, registered in this order.
        overrides = (
            ("tags", simple),
            ("technical_editors", simple),
            ("editors", simple),
            ("url", keyword),
            ("source_url", keyword),
            ("source_name", simple),
            ("publisher", simple),
            ("author", simple),
        )
        for field_name, field_analyzer in overrides:
            self.addAnalyzer(field_name, field_analyzer)

        # A keyword override for "fragment_anchor" is currently disabled:
        # self.addAnalyzer("fragment_anchor", keyword)
33
34
class IndexStore(object):
    """
    Base class holding the on-disk Lucene directory.

    On construction it makes sure ``settings.SEARCH_INDEX`` exists and opens
    a ``SimpleFSDirectory`` on it, exposed as ``self.store``.
    """

    def __init__(self):
        self.make_index_dir()
        index_path = File(settings.SEARCH_INDEX)
        self.store = SimpleFSDirectory(index_path)

    def make_index_dir(self):
        """
        Create the index directory (including missing parents).

        An "already exists" error is ignored; any other OSError propagates.
        """
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
47
48
class Index(IndexStore):
    """
    Writer side of the search index.

    Wraps a Lucene ``IndexWriter`` opened on the directory prepared by
    ``IndexStore``. Can be used as a context manager: the writer is opened
    on entry and optimized and closed on exit.
    """

    # Tags of the elements that may serve as the "master" element whose
    # children are indexed as the book's content.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    def __init__(self, analyzer=None):
        """
        Prepare the index directory; the writer itself is opened by open().

        analyzer -- analyzer used for writing; a WLAnalyzer when omitted.
        """
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        """
        Open the index writer and return it.

        analyzer -- optional analyzer for this writer only; defaults to the
                    analyzer given to the constructor.

        Raises Exception if the index is already opened.
        """
        if self.index:
            raise Exception("Index is already opened")
        if analyzer is None:
            analyzer = self.analyzer
        self.index = IndexWriter(self.store, analyzer, IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        """
        Optimize and close the writer; does nothing when it is not open.

        The writer is closed and forgotten even if optimizing fails, so a
        failed optimize does not leave it open.
        """
        if self.index is None:
            return
        try:
            self.index.optimize()
        finally:
            try:
                self.index.close()
            finally:
                self.index = None

    def remove_book(self, book):
        """
        Delete every document whose "book_id" equals book.id.
        """
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        """
        Add the book's metadata document and its content documents.

        overwrite -- when true, documents already indexed for this book are
                     removed first.

        Errors raised by the writer (JavaError) propagate to the caller.
        """
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)

        self.index.addDocument(doc)
        for part in parts:
            self.index.addDocument(part)

    def create_book_doc(self, book):
        """
        Create a lucene document connected to the book.

        The document carries a stored "book_id" and, when the book has a
        parent, a stored "parent_id".
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        """
        Build the metadata document for the book.

        Besides "slug" and "tags", every non-empty field listed in
        dcparser.BookInfo.FIELDS is added; how a value is rendered depends
        on the field's validator (text, person or date). Fields with any
        other validator are skipped.

        Raises Exception when a text field cannot be added.
        """
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))

        for field in dcparser.BookInfo.FIELDS:
            value = getattr(book_info, field.name, None)
            if not value:
                continue

            # No type information is available, so the validator is used
            # to tell the kinds of values apart.
            type_indicator = field.validator
            if type_indicator == dcparser.as_unicode:
                s = value
                if field.multiple:
                    s = ', '.join(s)
                try:
                    doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                except JavaError as je:
                    raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
            elif type_indicator == dcparser.as_person:
                if isinstance(value, dcparser.Person):
                    persons = unicode(value)
                else:
                    persons = ', '.join(map(unicode, value))
                doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
            elif type_indicator == dcparser.as_date:
                # Dates are indexed verbatim as YYYYMMDD.
                stamp = "%04d%02d%02d" % (value.year, value.month, value.day)
                doc.add(Field(field.name, stamp, Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        """
        Return the first element under root (in iteration order) whose tag
        is listed in master_tags, or None when there is none.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
        return None

    def extract_content(self, book):
        """
        Build the content documents for the book.

        Returns a list with one document per direct child of the master
        element ("header_index" + "content"), followed by one document per
        completed fragment ("fragment_anchor", "content", "themes").
        Returns an empty list when the book has no master element.
        """
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            # Nothing recognizable to index as content.
            return []

        # One document per top-level item:
        # book_id, header_index (0-based position of the item), content.
        header_docs = []
        for position, header in enumerate(master):
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            content = u' '.join(header.itertext())
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)

        def walker(node):
            # Depth-first walk yielding (node, None) when a node is entered
            # and (None, node) when it is left.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        # One document per fragment:
        # fragment_anchor - the anchor, themes - the fragment's themes.
        fragment_docs = []
        # fragment id -> {'content': [...], 'themes': [...]} for every
        # fragment that has been opened and not yet closed.
        fragments = {}
        for start, end in walker(master):
            if start is None:
                # Leaving a node: the text following it belongs to every
                # open fragment. This also covers the tails of begin/motyw
                # markers, so they must not be added on entry as well
                # (that would index the same text twice).
                for frag in fragments.values():
                    frag['content'].append(end.tail)
                continue

            if start.tag == 'begin':
                # The first character of the id is dropped; the rest is the
                # key under which the fragment is tracked.
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
            elif start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                frag = fragments.get(fid)
                # Skip a theme marker with no text or with no open fragment.
                if frag is not None and start.text is not None:
                    frag['themes'].append(start.text)
            elif start.tag == 'end':
                fid = start.attrib['id'][1:]
                frag = fragments.pop(fid, None)
                if frag is None:
                    # An end marker without a matching begin; skip it.
                    continue

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid, Field.Store.YES, Field.Index.NOT_ANALYZED))
                pieces = [s for s in frag['content'] if s is not None]
                doc.add(Field("content", u' '.join(pieces), Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes", u' '.join(frag['themes']), Field.Store.NO, Field.Index.ANALYZED))
                fragment_docs.append(doc)
            else:
                # Entering an ordinary node: its leading text belongs to
                # every open fragment.
                for frag in fragments.values():
                    frag['content'].append(start.text)

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
221
222
class Search(IndexStore):
    """
    Query side of the search index.

    Opens an IndexSearcher on the index directory and parses queries with
    the Polish analyzer.
    """

    def __init__(self, default_field="content"):
        """
        default_field -- field searched when the query names none.
        """
        IndexStore.__init__(self)
        lucene_version = Version.LUCENE_34
        # The per-field WLAnalyzer is not used on the query side.
        self.analyzer = PolishAnalyzer(lucene_version)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(lucene_version, default_field, self.analyzer)

    def query(self, query):
        """Parse a query string into a query object."""
        return self.parser.parse(query)

    def search(self, query, max_results=50):
        """
        Run the query and return (books, total_hits).

        books holds one Book per returned hit, looked up by the hit's
        stored "book_id"; no de-duplication is done.
        """
        parsed = self.query(query)
        top_docs = self.searcher.search(parsed, max_results)
        books = [
            Book.objects.get(id=self.searcher.doc(hit.doc).get("book_id"))
            for hit in top_docs.scoreDocs
        ]
        return (books, top_docs.totalHits)
244