# apps/search/index.py (wolnelektury.git)
# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, Term, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause
import os
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # possibly redundant: Field.Index.NOT_ANALYZED has much the same effect

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        # self.addAnalyzer("fragment_anchor", keyword)
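
        # Fields not registered above fall back to the default
        # PolishAnalyzer, so free text such as "content" gets Polish
        # stemming, while the KeywordAnalyzer fields ("url", "source_url",
        # "is_book") are indexed as single verbatim tokens.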


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already open")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        self.index.optimize()
        self.index.close()
        self.index = None

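    # Note: Lucene allows only one open IndexWriter per directory at a time
    # (enforced with a write.lock file), hence the explicit open()/close()
    # life cycle and the context-manager support at the end of this class.
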
    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

        for p in parts:
            block.add(p)
        # For index-time block joins the parent document must come last;
        # addDocuments() writes the whole block contiguously.
        block.add(doc)
        self.index.addDocuments(block)

    # tags of master (top-level body) elements in WL XML
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny',
        'liryka_l',
        'liryka_lp',
        'wywiad',
        ]

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the given book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        # Each FIELDS entry carries a validator and a name.
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # no explicit type information is available, so infer the
                # type from the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        """Return the first element in the tree whose tag is a master tag."""
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        # First, build a document for each top-level item, carrying:
        #   book_id
        #   header_index - the 0-based position of the header element
        #   content
        master = self.get_master(root)
        if master is None:
            return []

        header_docs = []
        for position, header in enumerate(master):
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
            content = u' '.join(header.itertext())
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)

        def walker(node):
            # Depth-first walk: yield (node, None) when entering a node and
            # (None, node) when leaving it, like open and close tags.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

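        # For a tree a(b, c), walker(a) yields, in order:
        #   (a, None), (b, None), (None, b), (c, None), (None, c), (None, a)
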
        # Then create a document for each fragment, carrying:
        #   fragment_anchor - the anchor
        #   themes - list of themes (indexed but not stored)
        fragment_docs = []
        # maps fragment id -> {'content': [], 'themes': []}
        fragments = {}
        for start, end in walker(master):
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                if fid not in fragments:
                    continue  # a broken <motyw> node, skip it
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                if fid not in fragments:
                    continue  # a broken <end> node, skip it
                frag = fragments[fid]
                del fragments[fid]

                def jstr(l):
                    return u' '.join(u'(none)' if x is None else unicode(x)
                                     for x in l)
                # debug trace of each completed fragment
                s = u"Fragment %s complete, themes: %s contents: %s" % \
                    (fid, jstr(frag['themes']), jstr(frag['content']))
                print(s.encode('utf-8'))

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content",
                              u' '.join(s for s in frag['content'] if s is not None),
                              Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes",
                              u' '.join(s for s in frag['themes'] if s is not None),
                              Field.Store.NO, Field.Index.ANALYZED))

                fragment_docs.append(doc)
            elif start is not None:
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()
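
    # A usage sketch (hypothetical; `book` stands for any
    # catalogue.models.Book with an attached xml_file):
    #
    #     with Index() as index:
    #         index.index_book(book)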


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively modifies the query so that contained Term and Phrase
        queries matching only the given fields are wrapped in a
        BlockJoinQuery and thus delegated to the child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

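    # Sketch of the effect on a hypothetical parsed query
    # `content:dolina author:mickiewicz` with fields=["content"]: the
    # content clause is wrapped in a BlockJoinQuery and scored against the
    # child (header/fragment) documents, while the author clause still
    # matches the parent (book) document directly.
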
    def simple_search(self, query, max_results=50):
        """Return (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        """Like simple_search, but joins content/themes clauses to child documents."""
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        """Block-join search: match child documents, return their books."""
        q = self.query(query)
        f = TermsFilter()
        f.addTerm(Term("is_book", "true"))
        bjq = BlockJoinQuery(q, f, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
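
# A usage sketch (hypothetical query strings; assumes an index built with
# Index above):
#
#     search = Search()
#     books, hits = search.simple_search(u"Pan Tadeusz")
#     books, hits = search.search(u"content:dolina themes:natura")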