from sunburnt import search
import copy
from httplib2 import socket
+import re
+
class TermVectorOptions(search.Options):
def __init__(self, schema, original=None):
self.init_schema()
except socket.error, e:
raise socket.error, "Cannot connect to Solr server while search indexing is enabled (%s)" % str(e)
-
def _analyze(self, **kwargs):
if not self.readable:
terms = map(lambda n: unicode(n.text), terms)
return terms
+ def expand_margins(self, text, start, end):
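+        """
+        Grow (start, end) outwards to the nearest word boundaries, so a
+        snippet never starts or ends in the middle of a word.
+        E.g. for text u"foo barbaz qux", (5, 8) expands to (4, 9).
+        """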
+ totlen = len(text)
+
+        ws = re.compile(r"\W", re.UNICODE)
+
+        def is_boundary(x):
+            # A boundary is any non-word character (Unicode-aware).
+            return bool(ws.match(x))
+
+ while start > 0:
+ if is_boundary(text[start - 1]):
+ break
+ start -= 1
+
+        while end < totlen - 1:  # keep text[end + 1] below in range
+ if is_boundary(text[end + 1]):
+ break
+ end += 1
+
+ return (start, end)
+
def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
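        """
        Cut one marked-up snippet out of `text`. `matches` is a list of
        (start, end) offsets; each kept match gets `margins` characters of
        context and is wrapped in the `mark` pair. Only matches whose
        windows overlap the first match are merged into the snippet.
        """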
start = None
end = None
        totlen = len(text)
        matches_margins = map(lambda (s, e):
                              ((s, e),
                               (max(0, s - margins), min(totlen, e + margins))),
                              matches)
+ matches_margins = map(lambda (m, (s, e)):
+ (m, self.expand_margins(text, s, e)),
+ matches_margins)
+
+        # let's start with the first match
(start, end) = matches_margins[0][1]
- matches = []
+ matches = [matches_margins[0][0]]
+
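+        # Merge any later match whose margin window overlaps the current
+        # snippet window; disjoint matches are skipped.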
for (m, (s, e)) in matches_margins[1:]:
if end < s or start > e:
continue
start = min(start, s)
end = max(end, e)
matches.append(m)
-
+
        snip = text[start:end]
        # Insert the marks starting from the last match, so that earlier
        # match offsets stay valid as the snippet grows.
        matches.sort(key=lambda m: m[0], reverse=True)
        for (s, e) in matches:
            off = -start
            snip = snip[:e + off] + mark[1] + snip[e + off:]
            snip = snip[:s + off] + mark[0] + snip[s + off:]
- # maybe break on word boundaries
return snip
-
        header_span = int(header_span) if header_span is not None else 1
fragment = doc.get("fragment_anchor", None)
snippets_pos = (doc['snippets_position'], doc['snippets_length'])
- snippets_rev = doc['snippets_revision']
+ snippets_rev = doc.get('snippets_revision', None)
hit = (sec + (header_span,), fragment, self._score, {
'how_found': how_found,
self._hits.append(hit)
def __unicode__(self):
- return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+ return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
(self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
def __str__(self):
def __init__(self, default_field="text"):
super(Search, self).__init__(mode='r')
- # def get_tokens(self, searched, field='text', cached=None):
- # """returns tokens analyzed by a proper (for a field) analyzer
- # argument can be: StringReader, string/unicode, or tokens. In the last case
- # they will just be returned (so we can reuse tokens, if we don't change the analyzer)
- # """
- # if cached is not None and field in cached:
- # return cached[field]
-
- # if isinstance(searched, str) or isinstance(searched, unicode):
- # searched = StringReader(searched)
- # elif isinstance(searched, list):
- # return searched
-
- # searched.reset()
- # tokens = self.analyzer.reusableTokenStream(field, searched)
- # toks = []
- # while tokens.incrementToken():
- # cta = tokens.getAttribute(CharTermAttribute.class_)
- # toks.append(cta.toString())
-
- # if cached is not None:
- # cached[field] = toks
-
- # return toks
-
- # @staticmethod
- # def fuzziness(fuzzy):
- # """Helper method to sanitize fuzziness"""
- # if not fuzzy:
- # return None
- # if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
- # return fuzzy
- # else:
- # return 0.5
-
- # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
- # """
- # Return a PhraseQuery with a series of tokens.
- # """
- # if fuzzy:
- # phrase = MultiPhraseQuery()
- # for t in tokens:
- # term = Term(field, t)
- # fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
- # fuzzterms = []
-
- # while True:
- # ft = fuzzterm.term()
- # if ft:
- # fuzzterms.append(ft)
- # if not fuzzterm.next(): break
- # if fuzzterms:
- # phrase.add(JArray('object')(fuzzterms, Term))
- # else:
- # phrase.add(term)
- # else:
- # phrase = PhraseQuery()
- # phrase.setSlop(slop)
- # for t in tokens:
- # term = Term(field, t)
- # phrase.add(term)
- # return phrase
def make_term_query(self, query, field='text', modal=operator.or_):
"""
res = query.execute()
return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
- # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
- # """
- # Search for perfect book matches. Just see if the query matches with some author or title,
- # taking hints into account.
- # """
- # fields_to_search = ['authors', 'title']
- # only_in = None
- # if hint:
- # if not hint.should_search_for_book():
- # return []
- # fields_to_search = hint.just_search_in(fields_to_search)
- # only_in = hint.book_filter()
-
- # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
- # books = []
- # for q in qrys:
- # top = self.searcher.search(q,
- # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, how_found="search_perfect_book"))
- # return books
-
- # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
- # fields_to_search = ['tags', 'authors', 'title']
-
- # only_in = None
- # if hint:
- # if not hint.should_search_for_book():
- # return []
- # fields_to_search = hint.just_search_in(fields_to_search)
- # only_in = hint.book_filter()
-
- # tokens = self.get_tokens(searched, field='SIMPLE')
-
- # q = BooleanQuery()
-
- # for fld in fields_to_search:
- # q.add(BooleanClause(self.make_term_query(tokens, field=fld,
- # fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
- # books = []
- # top = self.searcher.search(q,
- # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, how_found="search_book"))
-
- # return books
-
- # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
- # """
- # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
- # some part/fragment of the book.
- # """
- # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
-
- # flt = None
- # if hint:
- # flt = hint.part_filter()
-
- # books = []
- # for q in qrys:
- # top = self.searcher.search(q,
- # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
- # flt]),
- # max_results)
- # for found in top.scoreDocs:
- # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
- # return books
def search_everywhere(self, searched, query_terms=None):
"""
except catalogue.models.Book.DoesNotExist: pass
return bks
- # def make_prefix_phrase(self, toks, field):
- # q = MultiPhraseQuery()
- # for i in range(len(toks)):
- # t = Term(field, toks[i])
- # if i == len(toks) - 1:
- # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
- # if pterms:
- # q.add(pterms)
- # else:
- # q.add(t)
- # else:
- # q.add(t)
- # return q
-
- # @staticmethod
- # def term_filter(term, inverse=False):
- # only_term = TermsFilter()
- # only_term.addTerm(term)
-
- # if inverse:
- # neg = BooleanFilter()
- # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
- # only_term = neg
-
- # return only_term
-
-
@staticmethod
def apply_filters(query, filters):
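        """Apply each filter in `filters` to the query, in order."""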
for f in filters:
query = query.query(f)
return query
-
- # def filtered_categories(self, tags):
- # """
- # Return a list of tag categories, present in tags list.
- # """
- # cats = {}
- # for t in tags:
- # cats[t.category] = True
- # return cats.keys()
-
- # def hint(self):
- # return Hint(self)
+++ /dev/null
-
-from django.core.management.base import BaseCommand
-from search import IndexChecker
-
-class Command(BaseCommand):
- help = 'Check Lucene search index'
- args = ''
-
- def handle(self, *args, **opts):
- checker = IndexChecker()
- status = checker.check()
- if status.clean:
- print "No problems found."
- else:
- if status.missingSegments:
- print "Unable to locate."
- print "Number of bad segments: %d / %d (max segment name is %d)" % \
- (status.numBadSegments, status.numSegments, status.maxSegmentName)
- print "Total lost documents (due to bad segments) %d" % status.totLoseDocCount
- if not status.validCounter:
- print "Segment counter is not valid."
-
+++ /dev/null
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
- <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:S%C4%99p-Szarzy%C5%84ski/Rytmy/Fraszka_do_Anusie">
-<dc:creator xml:lang="pl">Sęp Szarzyński, Mikołaj</dc:creator>
-<dc:title xml:lang="pl">Fraszka do Anusie</dc:title>
-<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
-<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Barok</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre>
-<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:identifier.url>
-<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/8759</dc:source.URL>
-<dc:source xml:lang="pl">Szarzyński Sęp, Mikołaj (ca 1550-1581), Rytmy abo Wiersze polskie w wyborze, E. Wende, Warszawa, 1914</dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Mikołaj Sęp Szarzyński zm. 1581</dc:rights>
-<dc:date.pd xml:lang="pl">1581</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2008-12-29</dc:date>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
- <liryka_l>
-
-<autor_utworu>Mikołaj Sęp Szarzyński</autor_utworu>
-
-<nazwa_utworu>Fraszka do Anusie</nazwa_utworu>
-
-
-
-<strofa><begin id="b1230084410751"/><motyw id="m1230084410751">Kochanek, Łzy, Miłość, Oko, Serce, Wzrok</motyw>Jeśli oczu hamować swoich nie umiały/
-Leśnych krynic boginie, aby nie płakały,/
-Gdy baczyły<pe><slowo_obce>baczyły</slowo_obce> --- tu: zobaczyły, patrzyły na.</pe> przy studni Narcyza pięknego,/
-A on umarł prze miłość oblicza swojego;/
-Jeśli nieśmiertelnym stanom żałość rozkazuje,/
-Gdy niebaczna fortuna co niesłusznie psuje:</strofa>
-
-<strofa>Jakoż ja mam hamować, by na lice moje/
-Z oczu smutnych żałośne nie płynęły zdroje?/
-Jako serce powściągać, aby nie wzdychało/
-I od ciężkiej żałości omdlewać nie miało?<end id="e1230084410751"/></strofa>
-
-</liryka_l>
-</utwor>
+++ /dev/null
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="">
-<dc:creator xml:lang="pl">Kochanowski, Jan</dc:creator>
-<dc:title xml:lang="pl">Fraszki</dc:title>
-<dc:relation.hasPart xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:relation.hasPart>
-
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Renesans</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre>
-
-<dc:description xml:lang="pl"></dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/lektura/fraszki</dc:identifier.url>
-<dc:source xml:lang="pl"></dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Jan Kochanowski zm. 1584</dc:rights>
-<dc:date.pd xml:lang="pl">1584</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2008-11-12</dc:date>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-</utwor>
# -*- coding: utf-8 -*-
from django.conf import settings
from django.test.utils import override_settings
-from catalogue.test_utils import WLTestCase
-from lucene import PolishAnalyzer, Version
+from catalogue.test_utils import WLTestCase, get_fixture
from os import path
import tempfile
-from catalogue import models
-from search import Search, SearchResult
+from catalogue.models import Book, Tag
+from search import Index, Search, SearchResult
+import catalogue
+import opds
@override_settings(
def setUp(self):
WLTestCase.setUp(self)
- txt = path.join(path.dirname(__file__), 'files/fraszka-do-anusie.xml')
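+        # Start from an empty index so only the two fixture books indexed
+        # below can produce search hits.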
+ index = Index()
+ index.index.delete_all()
+ index.index.commit()
+
with self.settings(NO_SEARCH_INDEX=False):
- self.book = models.Book.from_xml_file(txt)
+ self.do_doktora = Book.from_xml_file(
+ get_fixture('do-doktora.xml', opds))
+ self.do_anusie = Book.from_xml_file(
+ get_fixture('fraszka-do-anusie.xml', catalogue))
+
self.search = Search()
def test_search_perfect_book_author(self):
- books = self.search.search_perfect_book("sęp szarzyński")
+ books = self.search.search_books(self.search.index.query(authors=u"sęp szarzyński"))
assert len(books) == 1
        assert books[0].book_id == self.do_anusie.id
def test_search_perfect_book_title(self):
- books = self.search.search_perfect_book("fraszka anusie")
+ books = self.search.search_books(self.search.index.query(u"fraszka anusie"))
assert len(books) == 1
        assert books[0].book_id == self.do_anusie.id
def test_search_perfect_parts(self):
- books = self.search.search_perfect_parts("Jakoż hamować")
+ books = self.search.search_phrase(u"Jakoż hamować")
assert len(books) == 2
        for b in books:
            assert b.book_id == self.do_anusie.id
assert len(a[0].hits) == 1
def test_search_perfect_author_title(self):
- books = self.search.search_perfect_book("szarzyński anusie")
+ books = self.search.search_books(self.search.index.query(authors=u"szarzyński anusie"))
assert books == []
- books = self.search.search_book("szarzyński anusie")
- assert len(books) == 1
-
- books = self.search.search_book("szarzyński fraszka")
- assert len(books) == 1
-
- def test_search_everywhere(self):
- books = self.search.search_everywhere("szarzyński kochanek")
- print 'szarzyński kochanek %s' % [b.hits for b in books]
-
- books = self.search.search_everywhere("szarzyński narcyz")
- print 'szarzyński narcyz %s' % [b.hits for b in books]
-
- books = self.search.search_everywhere("anusie narcyz")
- print 'anusie narcyz %s' % [b.hits for b in books]
-
- # theme content cross
- books = self.search.search_everywhere("wzrok boginie")
- print 'wzrok boginie %s' % [b.hits for b in books]
-
- books = self.search.search_everywhere("anusie płynęły zdroje")
- print 'anusie płynęły zdroje %s' % [b.hits for b in books]
+
    # ensure the books do exist & sort them
results.sort(reverse=True)
- if len(results) == 1:
- fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
- if len(fragment_hits) == 1:
- #anchor = fragment_hits[0]['fragment']
- #frag = Fragment.objects.get(anchor=anchor)
- return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
- return HttpResponseRedirect(results[0].book.get_absolute_url())
- elif len(results) == 0:
+    # We don't want to redirect to the book text; display the results page even when there is just one hit.
+ # if len(results) == 1:
+ # fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+ # if len(fragment_hits) == 1:
+ # #anchor = fragment_hits[0]['fragment']
+ # #frag = Fragment.objects.get(anchor=anchor)
+ # return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+ # return HttpResponseRedirect(results[0].book.get_absolute_url())
+ if len(results) == 0:
form = PublishingSuggestForm(initial={"books": query + ", "})
return render_to_response('catalogue/search_no_hits.html',
{'tags': tags,