import errno
import os

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, NumericField, \
    Version, Document, JavaError, IndexSearcher, QueryParser, Term, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, KeywordAnalyzer, NumericRangeQuery

from librarian import dcparser
from librarian.parser import WLDocument

from catalogue.models import Book
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """
    Per-field analyzer for the book index.

    Uses PolishAnalyzer by default, with a plain SimpleAnalyzer for
    name-like fields and a KeywordAnalyzer (no tokenization) for URLs.
    """
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        #self.addanalyzer("fragment_anchor", keyword)
class IndexStore(object):
    """
    Base class owning the Lucene filesystem directory.

    Ensures the directory at settings.SEARCH_INDEX exists and opens it
    as a SimpleFSDirectory (``self.store``).
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        """Create the index directory, ignoring 'already exists' only."""
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                # Any other OS error (permissions, bad path) must surface.
                raise
class Index(IndexStore):
    """
    Writer side of the book index.

    Produces one Lucene document with the book's metadata, plus one
    document per top-level header of the text and one per themed
    fragment (begin/motyw/end markers in the WL XML).
    """
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None  # IndexWriter, created lazily by open()
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        """Open the underlying IndexWriter; raises if already open."""
        if self.index is not None:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer, IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        """Optimize, flush and close the IndexWriter."""
        # NOTE(review): reconstructed counterpart to open() — the original
        # body was lost in this listing; confirm against version control.
        self.index.optimize()
        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

    def remove_book(self, book):
        """Delete every document belonging to `book` from the index."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        """
        Index a single book: metadata document plus content documents.

        With overwrite=True (default) any previously indexed documents
        for this book are removed first.
        """
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)

        # Was wrapped in try/except JavaError dropping into a nose
        # debugger — leftover debugging; let errors propagate instead.
        self.index.addDocument(doc)
        for part in parts:
            self.index.addDocument(part)

    # Tags marking the master (main text) element of a WL document.
    # NOTE(review): the first entries were lost in this listing and are
    # reconstructed — confirm against version control.
    master_tags = [
        'opowiadanie', 'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    def create_book_doc(self, book):
        """
        Create a lucene document connected to the book
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        """Build the metadata document for `book` from its DC record."""
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    # Multi-valued fields arrive as sequences; join them.
                    if isinstance(s, (list, tuple)):
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    # Store dates as a sortable YYYYMMDD token.
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        """Return the first master (main-text) element, or None."""
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        """
        Build content documents for `book`: one per top-level header
        element and one per themed fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        # First, a sequence of top-level items:
        # header_index - the 0-indexed position of the header element.
        # content - all text inside it.
        header_docs = []
        for position, header in enumerate(master):
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            content = u' '.join([t for t in header.itertext()])
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)

        def walker(node):
            # Depth-first traversal yielding (node, None) on entry
            # and (None, node) on exit.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        # Then we create a document for each fragment:
        # fragment_anchor - the anchor id
        # themes - list of themes
        # content - text collected between begin and end markers
        fragments = {}  # fragment id -> {'content': [], 'themes': []}
        fragment_docs = []
        for start, end in walker(master):
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                # pop: the fragment is complete — without removing it,
                # later sibling text would keep accumulating into it.
                frag = fragments.pop(fid)
                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content", u' '.join(filter(lambda s: s is not None, frag['content'])), Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes", u' '.join(frag['themes']), Field.Store.NO, Field.Index.ANALYZED))
                fragment_docs.append(doc)
            elif start is not None:
                # Entering any other element: its text belongs to every
                # currently open fragment.
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                # Leaving an element: its tail text likewise.
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs
class Search(IndexStore):
    """
    Reader side of the book index: parses a query against one default
    field and maps hits back to catalogue Book objects.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)  # True: read-only
        self.parser = QueryParser(Version.LUCENE_34, default_field, self.analyzer)

    def query(self, query):
        """Parse the query string into a Lucene Query object."""
        return self.parser.parse(query)

    def search(self, query, max_results=50):
        """Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)