94e6f099ca544013370cad278b030449ef3bac79
[wolnelektury.git] / apps / search / index.py
1
2 from django.conf import settings
3 from lucene import SimpleFSDirectory, IndexWriter, File, Field, NumericField, PolishAnalyzer, \
4     Version, Document, JavaError, IndexSearcher, QueryParser, Term
5 import os
6 import errno
7 from librarian import dcparser
8 from catalogue.models import Book
9
10
class IndexStore(object):
    """Common base giving access to the on-disk Lucene index directory.

    Constructing an instance makes sure the directory named by
    ``settings.SEARCH_INDEX`` exists and opens it as a
    ``SimpleFSDirectory``, which is exposed as ``self.store``.
    """

    def __init__(self):
        self.make_index_dir()
        index_path = File(settings.SEARCH_INDEX)
        self.store = SimpleFSDirectory(index_path)

    def make_index_dir(self):
        """Create the index directory, tolerating one that already exists.

        Raises:
            OSError: for any ``os.makedirs`` failure other than ``EEXIST``.
        """
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as err:
            # Only "already exists" is expected; everything else propagates.
            if err.errno != errno.EEXIST:
                raise
23
24
class Index(IndexStore):
    """Writer-side wrapper around the Lucene index.

    Usable as a context manager: entering opens an ``IndexWriter`` with the
    default analyzer, leaving optimizes and closes it.
    """

    def __init__(self):
        IndexStore.__init__(self)
        # The open IndexWriter, or None while the index is closed.
        self.index = None

    def open(self, analyzer=None):
        """Open an ``IndexWriter`` on the store and return it.

        Args:
            analyzer: analyzer to use; when falsy a
                ``PolishAnalyzer(Version.LUCENE_34)`` is created.

        Raises:
            Exception: if a writer is already open on this object.
        """
        if not analyzer:
            analyzer = PolishAnalyzer(Version.LUCENE_34)
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, analyzer, IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        """Optimize and close the writer; do nothing if none is open.

        The handle is cleared before any Lucene call so that ``open()`` can
        be used again afterwards, and the writer is closed even when
        ``optimize()`` raises.
        """
        writer = self.index
        if writer is None:
            return
        self.index = None
        try:
            writer.optimize()
        finally:
            writer.close()

    def index_book(self, book, overwrite=True):
        """Add a document describing ``book`` to the open index.

        The document gets a stored numeric ``id``, an unstored ``slug`` and
        one unstored field per non-empty ``dcparser.BookInfo.FIELDS`` entry
        whose validator is ``as_unicode``, ``as_person`` or ``as_date``.
        Fields with any other validator are skipped.

        Args:
            book: object providing ``id``, ``slug`` and ``xml_file``.
            overwrite: when true, documents already indexed under the same
                ``id`` are deleted first.

        Raises:
            Exception: if Lucene rejects a text field (wraps ``JavaError``).
        """
        book_info = dcparser.parse(book.xml_file)

        if overwrite:
            # "id" is written below as a NumericField, whose terms are
            # prefix-coded; a plain Term("id", str(book.id)) never matches
            # them, so the previous copy would stay in the index.  An exact
            # numeric range query selects the document(s) to remove.
            from lucene import NumericRangeQuery
            same_id = NumericRangeQuery.newIntRange("id", book.id, book.id, True, True)
            self.index.deleteDocuments(same_id)

        doc = Document()
        doc.add(NumericField("id", Field.Store.YES, True).setIntValue(book.id))
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))

        for field in dcparser.BookInfo.FIELDS:
            # Missing and empty attributes are both skipped.
            value = getattr(book_info, field.name, None)
            if not value:
                continue

            # No explicit type information is available on the field
            # definition, so the validator function tells us the type.
            type_indicator = field.validator
            if type_indicator == dcparser.as_unicode:
                s = ', '.join(value) if field.multiple else value
                try:
                    doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                except JavaError as je:
                    raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
            elif type_indicator == dcparser.as_person:
                # NOTE(review): relies on str() of dcparser.Person giving a
                # searchable name - confirm Person defines a suitable
                # string conversion (and handles non-ASCII names).
                if isinstance(value, dcparser.Person):
                    persons = str(value)
                else:
                    persons = ', '.join(map(str, value))
                doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
            elif type_indicator == dcparser.as_date:
                # Dates are indexed verbatim as an eight-digit YYYYMMDD token.
                stamp = "%04d%02d%02d" % (value.year, value.month, value.day)
                doc.add(Field(field.name, stamp, Field.Store.NO, Field.Index.NOT_ANALYZED))

        self.index.addDocument(doc)

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
86
87
class Search(IndexStore):
    """Reader-side wrapper: parses query strings and looks up matching books.

    Holds an ``IndexSearcher`` on the store for its whole lifetime; call
    ``close()`` (or use the object as a context manager) to release it.
    """

    def __init__(self, default_field="description"):
        """Open a searcher and build a query parser.

        Args:
            default_field: field searched by query terms that do not name
                a field explicitly.
        """
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field, self.analyzer)

    def close(self):
        """Release the underlying ``IndexSearcher``; safe to call twice."""
        searcher = self.searcher
        self.searcher = None
        if searcher is not None:
            searcher.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def query(self, query):
        """Parse the query string ``query`` into a Lucene query object."""
        return self.parser.parse(query)

    def search(self, query, max_results=50):
        """Returns (books, total_hits)

        ``books`` holds the ``Book`` objects for at most ``max_results``
        hits, in the order Lucene returned them; ``total_hits`` is the
        total number of matching documents reported by Lucene.
        """
        tops = self.searcher.search(self.query(query), max_results)
        books = []
        for hit in tops.scoreDocs:
            # The stored "id" field carries the primary key of the Book.
            stored = self.searcher.doc(hit.doc)
            books.append(Book.objects.get(id=stored.get("id")))
        return (books, tops.totalHits)