some fixes to oaipmh: namespaces
diff --git a/apps/search/index.py b/apps/search/index.py
index 9d6d598..a0bf715 100644
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -18,7 +18,7 @@ from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader,
 # KeywordAnalyzer

 # Initialize jvm
-JVM = initVM(CLASSPATH)
+JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)

 import sys
 import os
@@ -31,9 +31,11 @@ import catalogue.models
 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
 from multiprocessing.pool import ThreadPool
 from threading import current_thread
+from itertools import chain
 import atexit
 import traceback
-
+import logging
+log = logging.getLogger('search')

 class WLAnalyzer(PerFieldAnalyzerWrapper):
     def __init__(self):
@@ -147,7 +149,6 @@ class Snippets(object):
             if not os.path.exists(self.path):
                 break
             self.revision += 1
-        print "using %s" % self.path

         self.file = open(self.path, mode)
         self.position = 0
@@ -218,7 +219,7 @@ class BaseIndex(IndexStore):
         try:
             self.index.optimize()
         except JavaError, je:
-            print "Error during optimize phase, check index: %s" % je
+            log.error("Error during optimize phase, check index: %s" % je)

         self.index.close()
         self.index = None
@@ -277,9 +278,9 @@ class Index(BaseIndex):
         if not remove_only:
             # then add them [all or just one passed]
             if not tags:
-                tags = catalogue.models.Tag.objects.exclude(category='set') + \
-                    PDCounterAuthor.objects.all() + \
-                    PDCounterBook.objects.all()
+                tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
+                    PDCounterAuthor.objects.all(), \
+                    PDCounterBook.objects.all())

         for tag in tags:
             if isinstance(tag, PDCounterAuthor):
@@ -316,14 +317,19 @@ class Index(BaseIndex):
             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
         return doc

-    def remove_book(self, book, remove_snippets=True):
+    def remove_book(self, book_or_id, remove_snippets=True):
         """Removes a book from search index.
         book - Book instance."""
-        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
+        if isinstance(book_or_id, catalogue.models.Book):
+            book_id = book_or_id.id
+        else:
+            book_id = book_or_id
+
+        q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
         self.index.deleteDocuments(q)

         if remove_snippets:
-            snippets = Snippets(book.id)
+            snippets = Snippets(book_id)
             snippets.remove()

     def index_book(self, book, book_info=None, overwrite=True):
@@ -338,7 +344,11 @@ class Index(BaseIndex):
             self.remove_book(book, remove_snippets=False)

         book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info)
+        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
+        # let's not index it - it's only used for extracting publish date
+        if 'source_name' in meta_fields:
+            del meta_fields['source_name']
+
         for f in meta_fields.values():
             if isinstance(f, list) or isinstance(f, tuple):
                 for elem in f:
@@ -372,7 +382,7 @@ class Index(BaseIndex):

     published_date_re = re.compile("([0-9]+)[\]. ]*$")

-    def extract_metadata(self, book, book_info=None):
+    def extract_metadata(self, book, book_info=None, dc_only=None):
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
         """
@@ -387,6 +397,8 @@ class Index(BaseIndex):

         # validator, name
         for field in dcparser.BookInfo.FIELDS:
+            if dc_only and field.name not in dc_only:
+                continue
             if hasattr(book_info, field.name):
                 if not getattr(book_info, field.name):
                     continue
@@ -492,8 +504,6 @@ class Index(BaseIndex):
                        .setIntValue('header_span' in fields and fields['header_span'] or 1))
             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

-            print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content'])
-
             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                           Field.TermVector.WITH_POSITIONS_OFFSETS))
@@ -623,7 +633,7 @@ def log_exception_wrapper(f):
         try:
             f(*a)
         except Exception, e:
-            print("Error in indexing thread: %s" % e)
+            log.error("Error in indexing thread: %s" % e)
             traceback.print_exc()
             raise e
     return _wrap
@@ -643,7 +653,6 @@ class ReusableIndex(Index):
         if ReusableIndex.index:
             self.index = ReusableIndex.index
         else:
-            print("opening index")
             Index.open(self, analyzer, **kw)
             ReusableIndex.index = self.index
             atexit.register(ReusableIndex.close_reusable)
@@ -655,7 +664,6 @@ class ReusableIndex(Index):
     @staticmethod
     def close_reusable():
         if ReusableIndex.index:
-            print("closing index")
             ReusableIndex.index.optimize()
             ReusableIndex.index.close()
             ReusableIndex.index = None
@@ -808,7 +816,7 @@ class SearchResult(object):
         # remove fragments with duplicated fid's and duplicated snippets
         frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
-        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f),
+        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
                                   lambda a, b: cmp(a[SCORE], b[SCORE]))

         # remove duplicate sections
@@ -874,7 +882,6 @@ class SearchResult(object):
         for r in rl:
             if r.book_id in books:
                 books[r.book_id].merge(r)
-                #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
             else:
                 books[r.book_id] = r
         return books.values()
@@ -1010,9 +1017,8 @@ class Search(IndexStore):
     def reopen(self, **unused):
         reader = self.searcher.getIndexReader()
         rdr = reader.reopen()
-        print "got signal to reopen index"
         if not rdr.equals(reader):
-            print "will reopen index"
+            log.debug('Reopening index')
             oldsearch = self.searcher
             self.searcher = IndexSearcher(rdr)
             oldsearch.close()
@@ -1060,7 +1066,8 @@ class Search(IndexStore):

         return toks

-    def fuzziness(self, fuzzy):
+    @staticmethod
+    def fuzziness(fuzzy):
         """Helper method to sanitize fuzziness"""
         if not fuzzy:
             return None
@@ -1081,7 +1088,6 @@ class Search(IndexStore):
             fuzzterms = []

             while True:
-                # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                 ft = fuzzterm.term()
                 if ft:
                     fuzzterms.append(ft)
@@ -1098,7 +1104,8 @@ class Search(IndexStore):
                 phrase.add(term)
         return phrase

-    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
+    @staticmethod
+    def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
         """
         Returns term queries joined by boolean query.
         modal - applies to boolean query
@@ -1252,7 +1259,6 @@ class Search(IndexStore):
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
-            print "* %s theme x content: %s" % (searched, books[-1]._hits)

         # query themes/content x author/title/tags
         q = BooleanQuery()
@@ -1271,7 +1277,6 @@ class Search(IndexStore):
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
-            print "* %s scatter search: %s" % (searched, books[-1]._hits)

         return books
@@ -1332,9 +1337,17 @@ class Search(IndexStore):
             return None
         revision = stored.get('snippets_revision')
         if revision: revision = int(revision)
+
         # locate content.
         book_id = int(stored.get('book_id'))
-        snippets = Snippets(book_id, revision=revision).open()
+        snippets = Snippets(book_id, revision=revision)
+
+        try:
+            snippets.open()
+        except IOError, e:
+            log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
+            return []
+
         try:
             try:
                 text = snippets.get((int(position),
@@ -1371,13 +1384,13 @@ class Search(IndexStore):
         if terms:
             return JArray('object')(terms, Term)

-    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
+    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
         """
         Search for Tag objects using query.
         """
         if not pdcounter:
-            filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
-        tops = self.searcher.search(query, filters, max_results)
+            filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
+        tops = self.searcher.search(query, filt, max_results)
         tags = []
         for found in tops.scoreDocs:
@@ -1396,22 +1409,23 @@ class Search(IndexStore):
                 else:
                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                     # don't add the pdcounter tag if same tag already exists
-                    if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
-                        tags.append(tag)
+
+                tags.append(tag)
+
             except catalogue.models.Tag.DoesNotExist: pass
             except PDCounterAuthor.DoesNotExist: pass
             except PDCounterBook.DoesNotExist: pass
-            # print "%s (%d) -> %f" % (tag, tag.id, found.score)
-        print 'returning %s' % tags
+        log.debug('search_tags: %s' % tags)
+
         return tags

-    def search_books(self, query, filter=None, max_results=10):
+    def search_books(self, query, filt=None, max_results=10):
         """
         Searches for Book objects using query
         """
         bks = []
-        tops = self.searcher.search(query, filter, max_results)
+        tops = self.searcher.search(query, filt, max_results)
         for found in tops.scoreDocs:
             doc = self.searcher.doc(found.doc)
             try: