from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
# Imports needed by the code below (operator.or_ in make_term_query,
# settings.SOLR and CustomSolrInterface in SolrIndex, log.error in error
# handling; `custom` is assumed to be the sibling module providing
# CustomSolrInterface):
import operator
import logging
from django.conf import settings
import custom

log = logging.getLogger('search')
class SolrIndex(object):
def __init__(self, mode=None):
self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
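# Hedged usage sketch (not part of the diff): subclasses pass the access mode
# straight through to CustomSolrInterface; Search below opens the index
# read-only, while an indexer would use a writable mode.
#
#     class Search(SolrIndex):
#         def __init__(self, default_field="text"):
#             super(Search, self).__init__(mode='r')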
- if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
- else: fn = "%d" % self.book_id
+ if self.revision:
+ fn = "%d.%d" % (self.book_id, self.revision)
+ else:
+ fn = "%d" % self.book_id
- tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
- PDCounterAuthor.objects.all(), \
+ tags = chain(
+ catalogue.models.Tag.objects.exclude(category='set'),
+ PDCounterAuthor.objects.all(),
footnote_tags = ['pa', 'pt', 'pr', 'pe']
- skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
+ skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
+ '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
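# Assumed rationale (not stated in the diff): the author, title and
# parent-work headers, like the RDF metadata block, duplicate metadata that
# dcparser already indexes, so the content walker skips them.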
"""
Walks the book XML and extracts content from it.
Adds parts for each header tag and for each fragment.
"""
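# A minimal stand-alone sketch of the start/end event walk this docstring
# describes (illustrative only; the real code walks a librarian WLDocument,
# and the element names here are invented):
#
#     from lxml import etree
#     doc = etree.fromstring(
#         '<utwor><akap>text<begin id="b1"/>frag<end id="b1"/></akap></utwor>')
#     for event, elem in etree.iterwalk(doc, events=('start', 'end')):
#         print event, elem.tag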
- # separator = [u" ", u"\t", u".", u";", u","]
+ # separator = [u" ", u"\t", u".", u";", u","]
if isinstance(text, list):
# need to join it first
text = filter(lambda s: s is not None, text)
# handle fragments and themes.
if start is not None and start.tag == 'begin':
fid = start.attrib['id'][1:]
- fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
+ fragments[fid] = {
+ 'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
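# For reference (derived from the dict literal above): an open fragment stays
# in `fragments`, keyed by its id with the leading character stripped, until
# its matching <end> tag is seen, e.g.
#     fragments['1234'] == {'text': [...], 'themes': [u'strach'],
#                           'start_section': 5, 'start_header': 'akap'}
# (the example values are invented).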
# themes for this fragment
elif start is not None and start.tag == 'motyw':
fid = start.attrib['id'][1:]
if start.text is not None:
fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
elif end is not None and end.tag == 'motyw':
# in the end, add a section text.
doc = add_part(snippets, header_index=position,
- (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
+ (self.book_id, len(self._hits),
+ len(self._processed_hits) if self._processed_hits else -1,
+ self._score, len(self.snippets))
- lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
- and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
- frags)), sect)
-
- hits = []
+ lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
+ f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
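# The chained comparison above is an interval-containment test; spelled out
# with assumed names (illustrative only):
#     f_start <= s_start < f_start + f_span
# i.e. the section hit `s` begins inside fragment `f`.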
def snippet_revision(self, idx=0):
try:
return self.hits[idx]['snippets_revision']
def __init__(self, default_field="text"):
super(Search, self).__init__(mode='r')
def make_term_query(self, query, field='text', modal=operator.or_):
"""
Returns term queries joined into a boolean query.
modal - boolean operator used to join the term queries.
"""
- q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
- query.split(r" ")), q)
+ q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
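# Illustrative expansion (not part of the diff): with the default
# modal=operator.or_,
#     make_term_query(u"ala ma kota")
# builds roughly q | Q(text=u"ala") | Q(text=u"ma") | Q(text=u"kota"),
# where Q is self.index.Q; with modal=operator.and_ the terms are AND-ed.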
return q
def search_phrase(self, searched, field='text', book=False,
filters=None,
snippets=False):
q = self.index.query(**{field: searched})
q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
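# Hedged usage sketch (the filter shape is assumed from how apply_filters is
# used in this module):
#     q = search.search_phrase(u"dwie lokomotywy",
#                              filters=[search.index.Q(book_id=123)])
# field_limit(score=True, all_fields=True) asks Solr to return the relevance
# score alongside every stored field of each hit.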
def search_some(self, searched, fields, book=True,
filters=None, snippets=True, query_terms=None):
assert isinstance(fields, list)
def search_everywhere(self, searched, query_terms=None):
"""
Tries to use search terms to match different fields of a book (or its parts).
- log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
+ book = catalogue.models.Book.objects.filter(id=book_id)
+ if not book:
+ log.error("Book does not exist for book id = %d" % book_id)
+ elif not book.get().children.exists():
+ log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
if not pdcounter:
filters.append(~self.index.Q(is_pdcounter=True))
res = self.apply_filters(query, filters).execute()
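# Illustrative: the negated Q above corresponds roughly to appending
# `NOT is_pdcounter:true` to the Solr query, so pdcounter entries are
# excluded unless pdcounter=True is passed.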
is_pdcounter = doc.get('is_pdcounter', False)
category = doc.get('tag_category')
try:
if category == 'pd_author':
tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
elif category == 'pd_book':
tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
tag.category = 'pd_book'  # make it look more like a tag.
else:
- print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
+ # The referenced pdcounter tag is missing from the database; warn and skip.
+ print ("Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (
+ int(doc.get('tag_id')), category)).encode('utf-8')
- except catalogue.models.Tag.DoesNotExist: pass
- except PDCounterAuthor.DoesNotExist: pass
- except PDCounterBook.DoesNotExist: pass
+ except catalogue.models.Tag.DoesNotExist:
+ pass
+ except PDCounterAuthor.DoesNotExist:
+ pass
+ except PDCounterBook.DoesNotExist:
+ pass