1 # -*- coding: utf-8 -*-
2 from django.conf import settings
3 from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
4 NumericField, Version, Document, JavaError, IndexSearcher, \
5 QueryParser, Term, PerFieldAnalyzerWrapper, \
6 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
7 KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
8 BlockJoinQuery, BlockJoinCollector, TermsFilter, \
9 HashSet, BooleanClause, Term
13 from librarian import dcparser
14 from librarian.parser import WLDocument
15 import catalogue.models
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """Per-field analyzer used when writing the book index.

    The Polish analyzer is the default; the fields registered in
    ``__init__`` are handled by a simple or a keyword analyzer instead.
    """

    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # Any field without an explicit entry below falls back to `polish`.
        PerFieldAnalyzerWrapper.__init__(self, polish)

        field_analyzers = (
            ("tags", simple),
            ("technical_editors", simple),
            ("editors", simple),
            ("url", keyword),
            ("source_url", keyword),
            ("source_name", simple),
            ("publisher", simple),
            ("author", simple),
            ("is_book", keyword),
        )
        for field_name, analyzer in field_analyzers:
            self.addAnalyzer(field_name, analyzer)

        # Left disabled, as in the original:
        # self.addAnalyzer("fragment_anchor", keyword)
class IndexStore(object):
    """Base class holding the Lucene directory at ``settings.SEARCH_INDEX``."""

    def __init__(self):
        # Make sure the directory exists before Lucene is pointed at it.
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        """Create ``settings.SEARCH_INDEX`` unless it already exists.

        Raises OSError for any failure other than "directory exists".
        """
        # Imported locally so the method does not depend on module-level
        # imports that are not visible alongside this class.
        import errno
        import os

        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            # An existing directory is fine; anything else is a real error.
            if exc.errno != errno.EEXIST:
                raise
54 class Index(IndexStore):
def __init__(self, analyzer=None):
    """Set up the store and the analyzer; the writer is created by open().

    analyzer -- analyzer used for writing documents. When omitted,
                a new WLAnalyzer is created.
    """
    IndexStore.__init__(self)
    # No writer yet: open() checks this attribute before creating one.
    self.index = None
    # Only fall back to the default when the caller supplied nothing, so
    # an explicitly passed analyzer is no longer overwritten.
    if analyzer is None:
        analyzer = WLAnalyzer()
    self.analyzer = analyzer
def open(self, analyzer=None):
    """Create the IndexWriter for this store and return it.

    analyzer -- accepted for interface compatibility; it is not used,
                the writer is always built with ``self.analyzer``.

    Raises Exception("Index is already opened") when a writer is
    already set on this object.
    """
    # getattr keeps this safe even if __init__ never set the attribute.
    if getattr(self, "index", None) is not None:
        raise Exception("Index is already opened")
    self.index = IndexWriter(self.store, self.analyzer,
                             IndexWriter.MaxFieldLength.LIMITED)
    return self.index
def remove_book(self, book):
    """Delete the indexed documents whose ``book_id`` equals ``book.id``."""
    # A numeric range with identical, inclusive bounds selects one value.
    same_book = NumericRangeQuery.newIntRange(
        "book_id", book.id, book.id, True, True)
    self.index.deleteDocuments(same_book)
def index_book(self, book, overwrite=True):
    """Index a book as a single block of documents.

    book      -- the book to index.
    overwrite -- when true, documents already indexed for this book are
                 removed first.
    """
    if overwrite:
        self.remove_book(book)

    doc = self.extract_metadata(book)
    parts = self.extract_content(book)
    block = ArrayList().of_(Document)
    # Content documents go first and the book document (the one carrying
    # "is_book") goes last: block-join queries expect the parent to close
    # its block.
    for part in parts:
        block.add(part)
    block.add(doc)
    self.index.addDocuments(block)
94 'dramat_wierszowany_l',
95 'dramat_wierszowany_lp',
96 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
def create_book_doc(self, book):
    """
    Create a lucene document connected to the book

    The document stores ``book_id`` and, when the book has a parent,
    ``parent_id``. Returns the new document.
    """
    doc = Document()
    doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
    if book.parent is not None:
        doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
    return doc
def extract_metadata(self, book):
    """Build the book-level document from the metadata parsed by dcparser.

    Adds slug, tags and the "is_book" marker, then one field per non-empty
    entry of ``dcparser.BookInfo.FIELDS``. Returns the document.

    Raises Exception when Lucene rejects a text field (wrapping JavaError).
    """
    book_info = dcparser.parse(book.xml_file)

    doc = self.create_book_doc(book)
    doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
    doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
    # Marker term used by the searcher's parent filter.
    doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

    for field in dcparser.BookInfo.FIELDS:
        # Missing and empty values are both skipped.
        value = getattr(book_info, field.name, None)
        if not value:
            continue

        # since no type information is available, we use validator
        type_indicator = field.validator
        if type_indicator == dcparser.as_unicode:
            if isinstance(value, (list, tuple)):
                # Multi-valued field: index all values as one string.
                value = ', '.join(value)
            try:
                doc.add(Field(field.name, value, Field.Store.NO, Field.Index.ANALYZED))
            except JavaError as je:
                raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, value, je.message, je.args))
        elif type_indicator == dcparser.as_person:
            # Either a single Person or an iterable of them.
            if isinstance(value, dcparser.Person):
                persons = unicode(value)
            else:
                persons = ', '.join(map(unicode, value))
            doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
        elif type_indicator == dcparser.as_date:
            # Dates are indexed as an unanalyzed YYYYMMDD string.
            doc.add(Field(field.name, "%04d%02d%02d" % (value.year, value.month, value.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
    return doc
def get_master(self, root):
    """Return the first element in ``root.iter()`` order whose tag is listed
    in ``self.master_tags``, or None when there is no such element.
    """
    for master in root.iter():
        if master.tag in self.master_tags:
            return master
    return None
def extract_content(self, book):
    """Build the content documents of a book.

    One document is created per direct child of the master element
    ("header" documents) and one per completed fragment (a matching
    <begin>/<end> pair). Returns header documents followed by fragment
    documents, or an empty list when the book has no master element.
    """
    wld = WLDocument.from_file(book.xml_file.path)
    root = wld.edoc.getroot()

    # first we build a sequence of top-level items.
    # header_index - the 0-indexed position of header element.
    master = self.get_master(root)
    if master is None:
        # Nothing to walk; avoid failing on list(None) below.
        return []

    header_docs = []
    for position, header in enumerate(master):
        doc = self.create_book_doc(book)
        doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
        doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
        content = u' '.join([t for t in header.itertext()])
        doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
        header_docs.append(doc)

    def walker(node):
        # Depth-first walk: (node, None) when an element is entered,
        # (None, node) when it is left.
        yield node, None
        for child in list(node):
            for b, e in walker(child):
                yield b, e
        yield None, node

    def jstr(items):
        # Debug helper: join items, showing missing text as "(none)".
        return u' '.join(map(
            lambda x: u'(none)' if x is None else unicode(x),
            items))

    # Then we create a document for each fragment
    # fragment_anchor - the anchor
    # themes - list of themes [indexed, not stored]
    fragment_docs = []
    # will contain (fragment id -> { content: [], themes: [] }
    fragments = {}
    for start, end in walker(master):
        if start is not None and start.tag == 'begin':
            fid = start.attrib['id'][1:]
            fragments[fid] = {'content': [], 'themes': []}
            # NOTE(review): this tail is appended again by the closing
            # event of the same element (last branch below), so it ends
            # up twice in this fragment's content - confirm intent.
            fragments[fid]['content'].append(start.tail)
        elif start is not None and start.tag == 'motyw':
            fid = start.attrib['id'][1:]
            if fid not in fragments:
                continue  # a theme for an unknown fragment, skip it
            fragments[fid]['themes'].append(start.text)
            fragments[fid]['content'].append(start.tail)
        elif start is not None and start.tag == 'end':
            fid = start.attrib['id'][1:]
            if fid not in fragments:
                continue  # a broken <end> node, skip it
            # The fragment is complete: stop collecting text for it.
            frag = fragments.pop(fid)

            s = u"Fragment %s complete, themes: %s contents: %s" % \
                (fid, jstr(frag['themes']), jstr(frag['content']))
            print(s.encode('utf-8'))

            doc = self.create_book_doc(book)
            doc.add(Field("fragment_anchor", fid,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("content",
                          u' '.join([c for c in frag['content'] if c is not None]),
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("themes",
                          u' '.join([t for t in frag['themes'] if t is not None]),
                          Field.Store.NO, Field.Index.ANALYZED))

            fragment_docs.append(doc)
        elif start is not None:
            # Ordinary element entered: its text belongs to every open fragment.
            for frag in fragments.values():
                frag['content'].append(start.text)
        elif end is not None:
            # Element left: its tail text belongs to every open fragment.
            for frag in fragments.values():
                frag['content'].append(end.tail)

    return header_docs + fragment_docs
233 def __exit__(self, type, value, tb):
237 class Search(IndexStore):
def __init__(self, default_field="content"):
    """Open a searcher over the index and prepare the query parser.

    default_field -- field handed to QueryParser as its default field.
    """
    IndexStore.__init__(self)
    self.analyzer = PolishAnalyzer(Version.LUCENE_34)
    ## self.analyzer = WLAnalyzer()
    self.searcher = IndexSearcher(self.store, True)
    self.parser = QueryParser(Version.LUCENE_34, default_field,
                              self.analyzer)

    # Filter selecting book-level documents (those marked "is_book");
    # wrapjoins() uses it as the parent filter of block-join queries.
    self.parent_filter = TermsFilter()
    self.parent_filter.addTerm(Term("is_book", "true"))
def query(self, query):
    """Parse ``query`` with this searcher's QueryParser and return the result."""
    parse = self.parser.parse
    return parse(query)
def wrapjoins(self, query, fields=None):
    """
    This function modifies the query in a recursive way,
    so Term and Phrase Queries contained, which match
    provided fields are wrapped in a BlockJoinQuery,
    and so delegated to children documents.

    query  -- a parsed Lucene query.
    fields -- names of the fields that live on child documents
              (defaults to no fields).

    Returns the (possibly modified or wrapped) query.
    """
    if fields is None:
        fields = []

    if BooleanQuery.instance_(query):
        qs = BooleanQuery.cast_(query)
        # Rewrite each clause in place, recursing into nested queries.
        for clause in qs:
            clause = BooleanClause.cast_(clause)
            clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
        return qs

    termset = HashSet()
    query.extractTerms(termset)
    for t in termset:
        t = Term.cast_(t)
        # A term on a field outside `fields` means this query is not
        # purely about child documents: leave it untouched.
        if t.field() not in fields:
            return query
    return BlockJoinQuery(query, self.parent_filter,
                          BlockJoinQuery.ScoreMode.Total)
def simple_search(self, query, max_results=50):
    """Returns (books, total_hits)

    The query string is parsed and run as-is. ``books`` holds one Book
    per returned hit, looked up by the hit's stored ``book_id``.
    """
    tops = self.searcher.search(self.query(query), max_results)
    bks = []
    for found in tops.scoreDocs:
        doc = self.searcher.doc(found.doc)
        bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    return (bks, tops.totalHits)
def search(self, query, max_results=50):
    """Returns (books, total_hits)

    The parsed query is passed through wrapjoins() for the "content" and
    "themes" fields before being run. ``books`` holds one Book per
    returned hit, looked up by the hit's stored ``book_id``.
    """
    query = self.query(query)
    query = self.wrapjoins(query, ["content", "themes"])

    tops = self.searcher.search(query, max_results)
    bks = []
    for found in tops.scoreDocs:
        doc = self.searcher.doc(found.doc)
        bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    return (bks, tops.totalHits)
def bsearch(self, query, max_results=50):
    """Returns (books, total_hits)

    The whole parsed query is wrapped in one BlockJoinQuery (average
    score mode) whose parent filter selects "is_book" documents.
    ``books`` holds one Book per returned hit, looked up by the hit's
    stored ``book_id``.
    """
    q = self.query(query)
    f = TermsFilter()
    f.addTerm(Term("is_book", "true"))
    bjq = BlockJoinQuery(q, f, BlockJoinQuery.ScoreMode.Avg)

    tops = self.searcher.search(bjq, max_results)
    bks = []
    for found in tops.scoreDocs:
        doc = self.searcher.doc(found.doc)
        bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    return (bks, tops.totalHits)