# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, Term, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause
import os
import errno

from librarian import dcparser
from librarian.parser import WLDocument
from catalogue.models import Book
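
# Note (not in the original file): PyLucene requires lucene.initVM() to be
# called once per process before any of the classes below are used; this
# module assumes that happens elsewhere during application startup.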


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        # self.addAnalyzer("fragment_anchor", keyword)
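
# Illustrative note (an assumption, not from the original code): with the
# per-field setup above, text fed to "author" is lowercased and split on
# non-letters by SimpleAnalyzer ("Adam Mickiewicz" -> [adam, mickiewicz]),
# while "url" passes through KeywordAnalyzer and is kept as a single token.
# All remaining fields fall back to the Polish stemming analyzer.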


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            # an existing index directory is fine; anything else is an error
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
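
# Note (an assumption, not from the original code): settings.SEARCH_INDEX is
# expected to be a filesystem path to the Lucene index directory, e.g.
#     SEARCH_INDEX = path.join(PROJECT_DIR, 'search_index')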


class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        self.index.optimize()
        self.index.close()
        self.index = None

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

        # for block-join queries the child documents (parts) must be added
        # before the parent document, within a single addDocuments() call
        for p in parts:
            block.add(p)
        block.add(doc)
        self.index.addDocuments(block)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        # First, build a document for each top-level element of the master:
        #   book_id
        #   header_index - the 0-based position of the header element
        #   header_type
        #   content
        master = self.get_master(root)
        header_docs = []
        for position, header in enumerate(master):
            print("header %s @%d" % (header, position))
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
            content = u' '.join([t for t in header.itertext()])
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)

        # A depth-first walker over the element tree which yields
        # (start, end) event pairs: (node, None) when entering a node
        # and (None, node) when leaving it.
        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        # Then create a document for each fragment:
        #   fragment_anchor - the anchor
        #   themes - list of themes [not indexed]
        fragment_docs = []
        # maps fragment id -> {'content': [], 'themes': []}
        fragments = {}
        for start, end in walker(master):
            print("%s %s" % (start, end))
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                frag = fragments[fid]
                del fragments[fid]
                print("Fragment %s complete, themes: %s contents: %s" % (fid, frag['themes'], frag['content']))

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content", u' '.join(filter(lambda s: s is not None, frag['content'])), Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes", u' '.join(frag['themes']), Field.Store.NO, Field.Index.ANALYZED))
                fragment_docs.append(doc)
            elif start is not None:
                # plain text goes into every currently open fragment
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
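

# A minimal indexing sketch (not part of the original module; the helper name
# and its argument are hypothetical): shows the intended lifecycle of Index
# via the context-manager protocol defined above.
def _reindex_books(books):
    with Index() as index:
        for book in books:
            index.index_book(book)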


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)
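
    # Illustrative note (an assumption, not from the original code): the
    # string goes through Lucene's QueryParser, so standard query syntax
    # applies, e.g.
    #     self.query(u'author:mickiewicz AND content:"pan tadeusz"')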

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrites the query so that Term and Phrase queries
        matching only the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)
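
    # Illustrative example (an assumption, not from the original code): for a
    # parsed query like
    #     +content:dusza +author:mickiewicz
    # wrapjoins(query, ["content", "themes"]) rewrites only the clause whose
    # terms all belong to the listed fields, giving roughly
    #     +BlockJoinQuery(content:dusza) +author:mickiewicz
    # so "content" is matched against child (header/fragment) documents and
    # joined up to the parent book, while "author" still matches the parent.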

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        f = TermsFilter()
        f.addTerm(Term("is_book", "true"))
        bjq = BlockJoinQuery(q, f, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
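

# A minimal search sketch (not part of the original module; the helper name
# and the query string are hypothetical): runs a block-joined search and
# prints the matching books.
def _example_search():
    search = Search()
    books, total = search.search(u"dusza")
    print("%d hits" % total)
    for book in books:
        print(book.slug)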