fnp
/
wolnelektury.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
librarian bump
[wolnelektury.git]
/
apps
/
search
/
index.py
diff --git a/apps/search/index.py b/apps/search/index.py
index 312cf94..a0bf715 100644
(file)
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -18,7 +18,7 @@
from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader,
# KeywordAnalyzer
# Initialize jvm
# KeywordAnalyzer
# Initialize jvm
-JVM = initVM(CLASSPATH)
+JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
import sys
import os
import sys
import os
@@ -317,14 +317,19 @@
class Index(BaseIndex):
doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
return doc
doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
return doc
- def remove_book(self, book, remove_snippets=True):
+ def remove_book(self, book_or_id, remove_snippets=True):
"""Removes a book from search index.
book - Book instance."""
"""Removes a book from search index.
book - Book instance."""
- q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+ if isinstance(book_or_id, catalogue.models.Book):
+ book_id = book_or_id.id
+ else:
+ book_id = book_or_id
+
+ q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
self.index.deleteDocuments(q)
if remove_snippets:
self.index.deleteDocuments(q)
if remove_snippets:
- snippets = Snippets(book.id)
+ snippets = Snippets(book_id)
snippets.remove()
def index_book(self, book, book_info=None, overwrite=True):
snippets.remove()
def index_book(self, book, book_info=None, overwrite=True):
@@ -339,7 +344,11 @@
class Index(BaseIndex):
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
self.remove_book(book, remove_snippets=False)
book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info)
+ meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+ # let's not index it - it's only used for extracting publish date
+ if 'source_name' in meta_fields:
+ del meta_fields['source_name']
+
for f in meta_fields.values():
if isinstance(f, list) or isinstance(f, tuple):
for elem in f:
for f in meta_fields.values():
if isinstance(f, list) or isinstance(f, tuple):
for elem in f:
@@ -373,7 +382,7 @@
class Index(BaseIndex):
published_date_re = re.compile("([0-9]+)[\]. ]*$")
published_date_re = re.compile("([0-9]+)[\]. ]*$")
- def extract_metadata(self, book, book_info=None):
+ def extract_metadata(self, book, book_info=None, dc_only=None):
"""
Extract metadata from book and returns a map of fields keyed by fieldname
"""
"""
Extract metadata from book and returns a map of fields keyed by fieldname
"""
@@ -388,6 +397,8 @@
class Index(BaseIndex):
# validator, name
for field in dcparser.BookInfo.FIELDS:
# validator, name
for field in dcparser.BookInfo.FIELDS:
+ if dc_only and field.name not in dc_only:
+ continue
if hasattr(book_info, field.name):
if not getattr(book_info, field.name):
continue
if hasattr(book_info, field.name):
if not getattr(book_info, field.name):
continue
@@ -1055,7 +1066,8 @@
class Search(IndexStore):
return toks
return toks
- def fuzziness(self, fuzzy):
+ @staticmethod
+ def fuzziness(fuzzy):
"""Helper method to sanitize fuzziness"""
if not fuzzy:
return None
"""Helper method to sanitize fuzziness"""
if not fuzzy:
return None
@@ -1092,7 +1104,8 @@
class Search(IndexStore):
phrase.add(term)
return phrase
phrase.add(term)
return phrase
- def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+ @staticmethod
+ def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
"""
Returns term queries joined by boolean query.
modal - applies to boolean query
"""
Returns term queries joined by boolean query.
modal - applies to boolean query
@@ -1371,13 +1384,13 @@
class Search(IndexStore):
if terms:
return JArray('object')(terms, Term)
if terms:
return JArray('object')(terms, Term)
- def search_tags(self, query, filter=None, max_results=40, pdcounter=False):
+ def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
"""
Search for Tag objects using query.
"""
if not pdcounter:
"""
Search for Tag objects using query.
"""
if not pdcounter:
- filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
- tops = self.searcher.search(query, filter, max_results)
+ filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+ tops = self.searcher.search(query, filt, max_results)
tags = []
for found in tops.scoreDocs:
tags = []
for found in tops.scoreDocs:
@@ -1396,8 +1409,9 @@
class Search(IndexStore):
else:
tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
# don't add the pdcounter tag if same tag already exists
else:
tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
# don't add the pdcounter tag if same tag already exists
- if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
- tags.append(tag)
+
+ tags.append(tag)
+
except catalogue.models.Tag.DoesNotExist: pass
except PDCounterAuthor.DoesNotExist: pass
except PDCounterBook.DoesNotExist: pass
except catalogue.models.Tag.DoesNotExist: pass
except PDCounterAuthor.DoesNotExist: pass
except PDCounterBook.DoesNotExist: pass
@@ -1406,12 +1420,12 @@
class Search(IndexStore):
return tags
return tags
- def search_books(self, query, filter=None, max_results=10):
+ def search_books(self, query, filt=None, max_results=10):
"""
Searches for Book objects using query
"""
bks = []
"""
Searches for Book objects using query
"""
bks = []
- tops = self.searcher.search(query, filter, max_results)
+ tops = self.searcher.search(query, filt, max_results)
for found in tops.scoreDocs:
doc = self.searcher.doc(found.doc)
try:
for found in tops.scoreDocs:
doc = self.searcher.doc(found.doc)
try: