{% block right-column %}
<div class="snippets">
{% for hit in hits %}
- {% if hit.snippets %}
- <div class="snippet-text"><a href="{% url book_text book.slug %}#sec{{hit.section_number}}">{{hit.snippets.0|safe}}</a></div>
+ {% if hit.snippet %}
+ <div class="snippet-text"><a href="{% url book_text book.slug %}#sec{{hit.section_number}}">{{hit.snippet|safe}}</a></div>
{% else %}
{% if hit.fragment %}
<div class="snippet-text">
from basicauth import logged_in_or_basicauth, factory_decorator
from catalogue.models import Book, Tag
-from search.views import get_search, SearchResult, JVM
+from search.views import Search, SearchResult
from lucene import Term, QueryWrapperFilter, TermQuery
import logging
{u"href": reverse("opds_authors"),
u"rel": u"start",
u"type": u"application/atom+xml"})
- handler.addQuickElement(u"link", None,
+ handler.addQuickElement(u"link", None,
{u"href": full_url(os.path.join(settings.STATIC_URL, "opensearch.xml")),
u"rel": u"search",
u"type": u"application/opensearchdescription+xml"})
title = u"Wyniki wyszukiwania"
INLINE_QUERY_RE = re.compile(r"(author:(?P<author>[^ ]+)|title:(?P<title>[^ ]+)|categories:(?P<categories>[^ ]+)|description:(?P<description>[^ ]+))")
-
+
def get_object(self, request):
"""
For OPDS 1.1 we should handle a query for search terms.
OpenSearch defines fields: atom:author, atom:contributor (treated as translator),
atom:title. Inline query provides author, title, categories (treated as book tags),
description (treated as content search terms).
-
+
If search terms are provided, we shall search for books
according to Hint information (from author & contributor & title).
(perhaps for is_book=True)
"""
- JVM.attachCurrentThread()
query = request.GET.get('q', '')
-
+
inline_criteria = re.findall(self.INLINE_QUERY_RE, query)
if inline_criteria:
def get_criteria(criteria, name, position):
translator = request.GET.get('translator', '')
# Our client didn't handle the OPDS placeholders
- if author == '{atom:author}': author = ''
+ if author == '{atom:author}': author = ''
if title == '{atom:title}': title = ''
if translator == '{atom:contributor}': translator = ''
categories = None
fuzzy = False
- srch = get_search()
+ srch = Search()
hint = srch.hint()
# Scenario 1: full search terms provided.
filters = []
if author:
- log.info( "narrow to author %s" % author)
- hint.tags(srch.search_tags(srch.make_phrase(srch.get_tokens(author, field='authors'), field='authors'),
+ log.info("narrow to author %s" % author)
+ hint.tags(srch.search_tags(srch.make_phrase(srch.get_tokens(author, field='authors'), field='authors'),
filt=srch.term_filter(Term('tag_category', 'author'))))
if translator:
- log.info( "filter by translator %s" % translator)
+ log.info("filter by translator %s" % translator)
filters.append(QueryWrapperFilter(
srch.make_phrase(srch.get_tokens(translator, field='translators'),
field='translators')))
flt = srch.chain_filters(filters)
if title:
- log.info( "hint by book title %s" % title)
+ log.info("hint by book title %s" % title)
q = srch.make_phrase(srch.get_tokens(title, field='title'), field='title')
hint.books(*srch.search_books(q, filt=flt))
toks = srch.get_tokens(query)
log.info("tokens for query: %s" % toks)
-
+
results = SearchResult.aggregate(srch.search_perfect_book(toks, fuzzy=fuzzy, hint=hint),
srch.search_perfect_parts(toks, fuzzy=fuzzy, hint=hint),
srch.search_everywhere(toks, fuzzy=fuzzy, hint=hint))
--- /dev/null
+
+from sunburnt import sunburnt
+from lxml import etree
+import urllib
+import warnings
+from sunburnt import search
+import copy
+
+
+class TermVectorOptions(search.Options):
+ option_name = "tv"
+
+ def __init__(self, schema, original=None):
+ self.schema = schema
+ if original is None:
+ self.fields = set()
+ self.positions = False
+ else:
+ self.fields = copy.copy(original.fields)
+ self.positions = copy.copy(original.positions)
+
+ def update(self, positions=False, fields=None):
+ if fields is None:
+ fields = []
+ if isinstance(fields, basestring):
+ fields = [fields]
+ self.schema.check_fields(fields, {"stored": True})
+ self.fields.update(fields)
+ self.positions = positions
+
+ def options(self):
+ opts = {}
+ opts['tv'] = 'true'
+ if self.positions:
+ opts['tv.positions'] = 'true'
+ if self.fields:
+ opts['tv.fl'] = ','.join(sorted(self.fields))
+ return opts
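+ # For instance, with positions=True and fields={'text'}, options() returns
+ # {'tv': 'true', 'tv.positions': 'true', 'tv.fl': 'text'}; sunburnt appends
+ # these as extra request parameters to the Solr query.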
+
+
+class CustomSolrConnection(sunburnt.SolrConnection):
+ def __init__(self, *args, **kw):
+ super(CustomSolrConnection, self).__init__(*args, **kw)
+ self.analysis_url = self.url + "analysis/field/"
+
+ def analyze(self, params):
+ qs = urllib.urlencode(params)
+ url = "%s?%s" % (self.analysis_url, qs)
+ if len(url) > self.max_length_get_url:
+ warnings.warn("Long query URL encountered - POSTing instead of "
+ "GETting. This query will not be cached at the HTTP layer")
+ url = self.analysis_url
+ kwargs = dict(
+ method="POST",
+ body=qs,
+ headers={"Content-Type": "application/x-www-form-urlencoded"},
+ )
+ else:
+ kwargs = dict(method="GET")
+ r, c = self.request(url, **kwargs)
+ if r.status != 200:
+ raise sunburnt.SolrError(r, c)
+ return c
+
+
+# monkey patching sunburnt SolrSearch
+search.SolrSearch.option_modules += ('term_vectorer',)
+
+
+def __term_vector(self, positions=False, fields=None):
+ newself = self.clone()
+ newself.term_vectorer.update(positions, fields)
+ return newself
+setattr(search.SolrSearch, 'term_vector', __term_vector)
+__original__init_common_modules = search.SolrSearch._init_common_modules
+
+
+def __patched__init_common_modules(self):
+ __original__init_common_modules(self)
+ self.term_vectorer = TermVectorOptions(self.schema)
+setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
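+# A sketch of intended usage (all names as patched above): once SolrSearch is
+# patched, a query can request term vectors, e.g.
+#   interface.query(text=u"kot").term_vector(positions=True, fields=['text'])
+# which clones the query and merges TermVectorOptions into its option modules.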
+
+
+class CustomSolrInterface(sunburnt.SolrInterface):
+ # just copied from parent and SolrConnection -> CustomSolrConnection
+ def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
+ self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
+ self.schemadoc = schemadoc
+ if mode == 'r':
+ self.writeable = False
+ elif mode == 'w':
+ self.readable = False
+ self.init_schema()
+
+ def _analyze(self, **kwargs):
+ if not self.readable:
+ raise TypeError("This Solr instance is only for writing")
+ args = {
+ 'analysis_showmatch': True
+ }
+ if 'field' in kwargs: args['analysis_fieldname'] = kwargs['field']
+ if 'text' in kwargs: args['analysis_fieldvalue'] = kwargs['text']
+ if 'q' in kwargs: args['q'] = kwargs['q']
+ if 'query' in kwargs: args['q'] = kwargs['query']
+
+ params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args))
+
+ content = self.conn.analyze(params)
+ doc = etree.fromstring(content)
+ return doc
+
+ def highlight(self, **kwargs):
+ doc = self._analyze(**kwargs)
+ analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
+ matches = set()
+ for wrd in analyzed:
+ start = int(wrd.xpath("int[@name='start']")[0].text)
+ end = int(wrd.xpath("int[@name='end']")[0].text)
+ matches.add((start, end))
+
+ if matches:
+ return self.substring(kwargs['text'], matches,
+ margins=kwargs.get('margins', 30),
+ mark=kwargs.get('mark', ("<b>", "</b>")))
+ else:
+ return None
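+ # Hypothetical example: highlight(text=u"Ala ma kota", field='text', q=u"kota")
+ # should return u"Ala ma <b>kota</b>" (the whole text fits in the default margins).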
+
+ def analyze(self, **kwargs):
+ doc = self._analyze(**kwargs)
+ terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
+ terms = map(lambda n: unicode(n.text), terms)
+ return terms
+
+ def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
+ start = None
+ end = None
+ totlen = len(text)
+ matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
+ (start, end) = matches_margins[0]
+
+ for (s, e) in matches_margins[1:]:
+ if end < s or start > e:
+ continue
+ start = min(start, s)
+ end = max(end, e)
+
+ snip = text[start:end]
+ matches = list(matches)
+ matches.sort(lambda a, b: cmp(b[0], a[0]))
+ for (s, e) in matches:
+ off = - start
+ snip = snip[:e + off] + mark[1] + snip[e + off:]
+ snip = snip[:s + off] + mark[0] + snip[s + off:]
+ # maybe break on word boundaries
+ return snip
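+ # Worked example (assumed inputs): substring("abcdef", set([(2, 4)]), margins=1)
+ # widens the match to text[1:5] == "bcde" and returns "b<b>cd</b>e".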
+++ /dev/null
-
-from sunburnt import sunburnt
-from lxml import etree
-import urllib
-import warnings
-
-
-class HLSolrConnection(sunburnt.SolrConnection):
- def __init__(self, *args, **kw):
- super(HLSolrConnection, self).__init__(*args, **kw)
- self.analysis_url = self.url + "analysis/field/"
-
- def highlight(self, params):
- qs = urllib.urlencode(params)
- url = "%s?%s" % (self.analysis_url, qs)
- if len(url) > self.max_length_get_url:
- warnings.warn("Long query URL encountered - POSTing instead of "
- "GETting. This query will not be cached at the HTTP layer")
- url = self.analysis_url
- kwargs = dict(
- method="POST",
- body=qs,
- headers={"Content-Type": "application/x-www-form-urlencoded"},
- )
- else:
- kwargs = dict(method="GET")
- r, c = self.request(url, **kwargs)
- if r.status != 200:
- raise sunburnt.SolrError(r, c)
- return c
-
-
-class HLSolrInterface(sunburnt.SolrInterface):
- # just copied from parent and SolrConnection -> HLSolrConnection
- def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
- self.conn = HLSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
- self.schemadoc = schemadoc
- if mode == 'r':
- self.writeable = False
- elif mode == 'w':
- self.readable = False
- self.init_schema()
-
- def highlight(self, **kwargs):
- if not self.readable:
- raise TypeError("This Solr instance is only for writing")
- args = {
- 'analysis_fieldname': kwargs['field'],
- 'analysis_showmatch': True,
- 'analysis_fieldvalue': kwargs['text'],
- 'q': kwargs['q']
- }
- params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args))
-
- content = self.conn.highlight(params)
- doc = etree.fromstring(content)
- analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
- matches = set()
- for wrd in analyzed:
- start = int(wrd.xpath("int[@name='start']")[0].text)
- end = int(wrd.xpath("int[@name='end']")[0].text)
- matches.add((start, end))
-
- print matches
- return self.substring(kwargs['text'], matches,
- margins=kwargs.get('margins', 30),
- mark=kwargs.get('mark', ("<b>", "</b>")))
-
- def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
- start = None
- end = None
- totlen = len(text)
- matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
- (start, end) = matches_margins[0]
-
- for (s, e) in matches_margins[1:]:
- if end < s or start > e:
- continue
- start = min(start, s)
- end = max(end, e)
-
- snip = text[start:end]
- matches = list(matches)
- matches.sort(lambda a, b: cmp(b[0], a[0]))
- for (s, e) in matches:
- off = - start
- snip = text[:e + off] + mark[1] + snip[e + off:]
- snip = text[:s + off] + mark[0] + snip[s + off:]
- # maybe break on word boundaries
- return snip
import logging
log = logging.getLogger('search')
import sunburnt
-import highlight
+import custom
+import operator
class SolrIndex(object):
def __init__(self, mode=None):
- self.index = highlight.HLSolrInterface(settings.SOLR, mode=mode)
+ self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
break
for res in ids:
uids.add(res['uid'])
- st+=rows
+ st += rows
# print "Will delete %s" % ','.join([x for x in uids])
if uids:
self.index.delete(uids)
"tag_name": tag.name,
"tag_name_pl": tag.name,
"tag_category": 'pd_author',
- "is_pdcounter": True
+ "is_pdcounter": True,
+ "uid": "tag%d_pd_a" % tag.id
}
elif isinstance(tag, PDCounterBook):
doc = {
"tag_name": tag.title,
"tag_name_pl": tag.title,
"tag_category": 'pd_book',
- "is_pdcounter": True
+ "is_pdcounter": True,
+ "uid": "tag%d_pd_b" % tag.id
}
else:
doc = {
"tag_name": tag.name,
"tag_name_pl": tag.name,
"tag_category": tag.category,
- "is_pdcounter": False
+ "is_pdcounter": False,
+ "uid": "tag%d" % tag.id
}
- doc['uid'] = "tag%d" % tag.id
self.index.add(doc)
+ print "%s %s" % (doc['tag_name'], doc['tag_category'])
def create_book_doc(self, book):
"""
book_doc['uid'] = "book%s" % book_doc['book_id']
self.index.add(book_doc)
del book_doc
-
- self.index_content(book, book_fields={
+ book_fields = {
'title': meta_fields['title'],
'authors': meta_fields['authors'],
- 'published_date': meta_fields['published_date']})
+ 'published_date': meta_fields['published_date']
+ }
+ if 'translators' in meta_fields:
+ book_fields['translators'] = meta_fields['translators']
+
+ self.index_content(book, book_fields=book_fields)
master_tags = [
'opowiadanie',
doc['themes'] = fields['themes']
doc['uid'] = "part%s%s%s" % (doc['header_index'],
doc['header_span'],
- doc.get('fragment_anchor',''))
+ doc.get('fragment_anchor', ''))
return doc
def give_me_utf8(s):
snippets.close()
-
class SearchResult(object):
- def __init__(self, search, doc, how_found=None, snippets=None, searched=None, tokens_cache=None):
- if tokens_cache is None: tokens_cache = {}
+ def __init__(self, doc, how_found=None, query=None):
+ # self.search = search
+ self.boost = 1.0
+ self._hits = []
+ self._processed_hits = None # processed hits
+ self.snippets = []
if 'score' in doc:
self._score = doc['score']
else:
self._score = 0
- self.boost = 1.0
-
- self._hits = []
- self._processed_hits = None # processed hits
-
self.book_id = int(doc["book_id"])
- pd = doc["published_date"]
try:
- self.published_date = int(pd)
+ self.published_date = int(doc.get("published_date"))
- except ValueError:
+ except (ValueError, TypeError):
self.published_date = 0
+ # content hits
header_type = doc.get("header_type", None)
# we have a content hit in some header of fragment
if header_type is not None:
sec = (header_type, int(doc["header_index"]))
header_span = doc['header_span']
header_span = header_span is not None and int(header_span) or 1
-
fragment = doc.get("fragment_anchor", None)
+ snippets_pos = (doc['snippets_position'], doc['snippets_length'])
+ snippets_rev = doc.get('snippets_revision', None)
- if snippets:
- snippets = snippets.replace("/\n", "\n")
- hit = (sec + (header_span,), fragment, self._score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
+ hit = (sec + (header_span,), fragment, self._score, {
+ 'how_found': how_found,
+ 'snippets_pos': snippets_pos,
+ 'snippets_revision': snippets_rev
+ })
self._hits.append(hit)
- self.search = search
- self.searched = searched
- self.tokens_cache = tokens_cache
+ def __unicode__(self):
+ return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+ (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
+
+ def __str__(self):
+ return unicode(self).encode('utf-8')
@property
def score(self):
def get_book(self):
if hasattr(self, '_book'):
return self._book
- return catalogue.models.Book.objects.get(id=self.book_id)
+ self._book = catalogue.models.Book.objects.get(id=self.book_id)
+ return self._book
book = property(get_book)
+ POSITION = 0
+ FRAGMENT = 1
+ POSITION_INDEX = 1
+ POSITION_SPAN = 2
+ SCORE = 2
+ OTHER = 3
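+ # Each element of _hits is a tuple:
+ #   ((header_type, header_index, header_span), fragment_anchor, score, other_dict)
+ # and the constants above index into it (POSITION_* index the inner position tuple).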
+
@property
def hits(self):
if self._processed_hits is not None:
return self._processed_hits
- POSITION = 0
- FRAGMENT = 1
- POSITION_INDEX = 1
- POSITION_SPAN = 2
- SCORE = 2
- OTHER = 3
-
# to sections and fragments
- frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
+ frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
- sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
+ sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
# sections not covered by fragments
sect = filter(lambda s: 0 == len(filter(
- lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
- and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
+ lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
+ and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
frags)), sect)
hits = []
return els.values()
# remove fragments with duplicated fid's and duplicated snippets
- frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
- frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
- lambda a, b: cmp(a[SCORE], b[SCORE]))
+ frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
+ # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
+ # lambda a, b: cmp(a[SCORE], b[SCORE]))
# remove duplicate sections
sections = {}
for s in sect:
- si = s[POSITION][POSITION_INDEX]
+ si = s[self.POSITION][self.POSITION_INDEX]
# skip existing
if si in sections:
- if sections[si]['score'] >= s[SCORE]:
+ if sections[si]['score'] >= s[self.SCORE]:
continue
- m = {'score': s[SCORE],
- 'section_number': s[POSITION][POSITION_INDEX] + 1,
+ m = {'score': s[self.SCORE],
+ 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
}
- m.update(s[OTHER])
+ m.update(s[self.OTHER])
sections[si] = m
hits = sections.values()
for f in frags:
try:
- frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
+ frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
except catalogue.models.Fragment.DoesNotExist:
# stale index
continue
# Figure out if we were searching for a token matching some word in theme name.
themes = frag.tags.filter(category='theme')
themes_hit = []
- if self.searched is not None:
- tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
- for theme in themes:
- name_tokens = self.search.get_tokens(theme.name, 'POLISH')
- for t in tokens:
- if t in name_tokens:
- if not theme in themes_hit:
- themes_hit.append(theme)
- break
-
- m = {'score': f[SCORE],
+ # if self.searched is not None:
+ # tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
+ # for theme in themes:
+ # name_tokens = self.search.get_tokens(theme.name, 'POLISH')
+ # for t in tokens:
+ # if t in name_tokens:
+ # if not theme in themes_hit:
+ # themes_hit.append(theme)
+ # break
+
+ m = {'score': f[self.SCORE],
'fragment': frag,
- 'section_number': f[POSITION][POSITION_INDEX] + 1,
+ 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
'themes': themes,
'themes_hit': themes_hit
}
- m.update(f[OTHER])
+ m.update(f[self.OTHER])
hits.append(m)
hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
return hits
- def __unicode__(self):
- return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
-
@staticmethod
def aggregate(*result_lists):
books = {}
else:
return c
+ def __len__(self):
+ return len(self.hits)
-class Hint(object):
- """
- Given some hint information (information we already know about)
- our search target - like author, title (specific book), epoch, genre, kind
- we can narrow down search using filters.
- """
- def __init__(self, search):
- """
- Accepts a Searcher instance.
- """
- self.search = search
- self.book_tags = {}
- self.part_tags = []
- self._books = []
+ def snippet_pos(self, idx=0):
+ return self.hits[idx]['snippets_pos']
- def books(self, *books):
- """
- Give a hint that we search these books.
- """
- self._books = books
-
- def tags(self, tags):
- """
- Give a hint that these Tag objects (a list of)
- is necessary.
- """
- for t in tags:
- if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
- lst = self.book_tags.get(t.category, [])
- lst.append(t)
- self.book_tags[t.category] = lst
- if t.category in ['theme', 'theme_pl']:
- self.part_tags.append(t)
-
- def tag_filter(self, tags, field='tags'):
- """
- Given a lsit of tags and an optional field (but they are normally in tags field)
- returns a filter accepting only books with specific tags.
- """
- q = BooleanQuery()
-
- for tag in tags:
- toks = self.search.get_tokens(tag.name, field=field)
- tag_phrase = PhraseQuery()
- for tok in toks:
- tag_phrase.add(Term(field, tok))
- q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
-
- return QueryWrapperFilter(q)
-
- def book_filter(self):
- """
- Filters using book tags (all tag kinds except a theme)
- """
- tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
- if tags:
- return self.tag_filter(tags)
- else:
+ def snippet_revision(self, idx=0):
+ try:
+ return self.hits[idx]['snippets_revision']
+ except (IndexError, KeyError):
return None
- def part_filter(self):
- """
- This filter can be used to look for book parts.
- It filters on book id and/or themes.
- """
- fs = []
- if self.part_tags:
- fs.append(self.tag_filter(self.part_tags, field='themes'))
-
- if self._books != []:
- bf = BooleanFilter()
- for b in self._books:
- id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
- bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
- fs.append(bf)
-
- return Search.chain_filters(fs)
-
- def should_search_for_book(self):
- return self._books == []
-
- def just_search_in(self, all):
- """Holds logic to figure out which indexes should be search, when we have some hinst already"""
- some = []
- for field in all:
- if field == 'authors' and 'author' in self.book_tags:
- continue
- if field == 'title' and self._books != []:
- continue
- if (field == 'themes' or field == 'themes_pl') and self.part_tags:
- continue
- some.append(field)
- return some
-
class Search(SolrIndex):
"""
Search facilities.
"""
def __init__(self, default_field="text"):
- IndexStore.__init__(self)
- self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
- # self.analyzer = WLAnalyzer()
- reader = IndexReader.open(self.store, True)
- self.searcher = IndexSearcher(reader)
- self.parser = QueryParser(Version.LUCENE_34, default_field,
- self.analyzer)
-
- self.parent_filter = TermsFilter()
- self.parent_filter.addTerm(Term("is_book", "true"))
- index_changed.connect(self.reopen)
-
- def close(self):
- reader = self.searcher.getIndexReader()
- self.searcher.close()
- reader.close()
- super(Search, self).close()
- index_changed.disconnect(self.reopen)
-
- def reopen(self, **unused):
- reader = self.searcher.getIndexReader()
- rdr = reader.reopen()
- if not rdr.equals(reader):
- log.debug('Reopening index')
- oldsearch = self.searcher
- self.searcher = IndexSearcher(rdr)
- oldsearch.close()
- reader.close()
-
- def query(self, query):
- """Parse query in default Lucene Syntax. (for humans)
- """
- return self.parser.parse(query)
-
- def simple_search(self, query, max_results=50):
- """Runs a query for books using lucene syntax. (for humans)
- Returns (books, total_hits)
- """
-
- tops = self.searcher.search(self.query(query), max_results)
- bks = []
- for found in tops.scoreDocs:
- doc = self.searcher.doc(found.doc)
- bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
- return (bks, tops.totalHits)
-
- def get_tokens(self, searched, field='text', cached=None):
- """returns tokens analyzed by a proper (for a field) analyzer
- argument can be: StringReader, string/unicode, or tokens. In the last case
- they will just be returned (so we can reuse tokens, if we don't change the analyzer)
- """
- if cached is not None and field in cached:
- return cached[field]
+ super(Search, self).__init__()
- if isinstance(searched, str) or isinstance(searched, unicode):
- searched = StringReader(searched)
- elif isinstance(searched, list):
- return searched
-
- searched.reset()
- tokens = self.analyzer.reusableTokenStream(field, searched)
- toks = []
- while tokens.incrementToken():
- cta = tokens.getAttribute(CharTermAttribute.class_)
- toks.append(cta.toString())
-
- if cached is not None:
- cached[field] = toks
-
- return toks
-
- @staticmethod
- def fuzziness(fuzzy):
- """Helper method to sanitize fuzziness"""
- if not fuzzy:
- return None
- if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
- return fuzzy
- else:
- return 0.5
-
- def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
- """
- Return a PhraseQuery with a series of tokens.
- """
- if fuzzy:
- phrase = MultiPhraseQuery()
- for t in tokens:
- term = Term(field, t)
- fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
- fuzzterms = []
-
- while True:
- ft = fuzzterm.term()
- if ft:
- fuzzterms.append(ft)
- if not fuzzterm.next(): break
- if fuzzterms:
- phrase.add(JArray('object')(fuzzterms, Term))
- else:
- phrase.add(term)
- else:
- phrase = PhraseQuery()
- phrase.setSlop(slop)
- for t in tokens:
- term = Term(field, t)
- phrase.add(term)
- return phrase
-
- @staticmethod
- def make_term_query(tokens, field='text', modal='BooleanClause.Occur.SHOULD XXX', fuzzy=False):
+ # def get_tokens(self, searched, field='text', cached=None):
+ # """returns tokens analyzed by a proper (for a field) analyzer
+ # argument can be: StringReader, string/unicode, or tokens. In the last case
+ # they will just be returned (so we can reuse tokens, if we don't change the analyzer)
+ # """
+ # if cached is not None and field in cached:
+ # return cached[field]
+
+ # if isinstance(searched, str) or isinstance(searched, unicode):
+ # searched = StringReader(searched)
+ # elif isinstance(searched, list):
+ # return searched
+
+ # searched.reset()
+ # tokens = self.analyzer.reusableTokenStream(field, searched)
+ # toks = []
+ # while tokens.incrementToken():
+ # cta = tokens.getAttribute(CharTermAttribute.class_)
+ # toks.append(cta.toString())
+
+ # if cached is not None:
+ # cached[field] = toks
+
+ # return toks
+
+ # @staticmethod
+ # def fuzziness(fuzzy):
+ # """Helper method to sanitize fuzziness"""
+ # if not fuzzy:
+ # return None
+ # if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
+ # return fuzzy
+ # else:
+ # return 0.5
+
+ # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
+ # """
+ # Return a PhraseQuery with a series of tokens.
+ # """
+ # if fuzzy:
+ # phrase = MultiPhraseQuery()
+ # for t in tokens:
+ # term = Term(field, t)
+ # fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
+ # fuzzterms = []
+
+ # while True:
+ # ft = fuzzterm.term()
+ # if ft:
+ # fuzzterms.append(ft)
+ # if not fuzzterm.next(): break
+ # if fuzzterms:
+ # phrase.add(JArray('object')(fuzzterms, Term))
+ # else:
+ # phrase.add(term)
+ # else:
+ # phrase = PhraseQuery()
+ # phrase.setSlop(slop)
+ # for t in tokens:
+ # term = Term(field, t)
+ # phrase.add(term)
+ # return phrase
+
+ def make_term_query(self, query, field='text', modal=operator.or_):
"""
Returns term queries joined by boolean query.
modal - applies to boolean query
- fuzzy - should the query by fuzzy.
"""
- q = BooleanQuery()
- for t in tokens:
- term = Term(field, t)
- if fuzzy:
- term = FuzzyQuery(term, self.fuzziness(fuzzy))
- else:
- term = TermQuery(term)
- q.add(BooleanClause(term, modal))
+ q = self.index.Q()
+ q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
+ query.split()), q)
+
return q
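+ # For example, with the default modal, make_term_query(u"ala kota", field='text')
+ # reduces to Q() | Q(text=u'ala') | Q(text=u'kota'), i.e. an OR over the words.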
- def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
- filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
+ def search_phrase(self, searched, field='text', book=False,
+ filters=None,
+ snippets=False):
if filters is None: filters = []
- if tokens_cache is None: tokens_cache = {}
-
- tokens = self.get_tokens(searched, field, cached=tokens_cache)
-
- query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
- if book:
- filters.append(self.term_filter(Term('is_book', 'true')))
- top = self.searcher.search(query, self.chain_filters(filters), max_results)
+ if book: filters.append(self.index.Q(is_book=True))
- return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
+ q = self.index.query(**{field: searched})
+ q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
+ res = q.execute()
+ return [SearchResult(found, how_found=u'search_phrase') for found in res]
- def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
- filters=None, tokens_cache=None, boost=None, snippets=True):
+ def search_some(self, searched, fields, book=True,
+ filters=None,
+ snippets=True):
+ assert isinstance(fields, list)
if filters is None: filters = []
- if tokens_cache is None: tokens_cache = {}
+ if book: filters.append(self.index.Q(is_book=True))
- if book:
- filters.append(self.term_filter(Term('is_book', 'true')))
-
- query = BooleanQuery()
+ query = self.index.Q()
for fld in fields:
- tokens = self.get_tokens(searched, fld, cached=tokens_cache)
-
- query.add(BooleanClause(self.make_term_query(tokens, field=fld,
- fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
- top = self.searcher.search(query, self.chain_filters(filters), max_results)
-
- return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
- snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
-
- def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
- """
- Search for perfect book matches. Just see if the query matches with some author or title,
- taking hints into account.
- """
- fields_to_search = ['authors', 'title']
- only_in = None
- if hint:
- if not hint.should_search_for_book():
- return []
- fields_to_search = hint.just_search_in(fields_to_search)
- only_in = hint.book_filter()
-
- qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
- books = []
- for q in qrys:
- top = self.searcher.search(q,
- self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- max_results)
- for found in top.scoreDocs:
- books.append(SearchResult(self, found, how_found="search_perfect_book"))
- return books
-
- def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
- fields_to_search = ['tags', 'authors', 'title']
-
- only_in = None
- if hint:
- if not hint.should_search_for_book():
- return []
- fields_to_search = hint.just_search_in(fields_to_search)
- only_in = hint.book_filter()
-
- tokens = self.get_tokens(searched, field='SIMPLE')
-
- q = BooleanQuery()
-
- for fld in fields_to_search:
- q.add(BooleanClause(self.make_term_query(tokens, field=fld,
- fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+ query = self.index.Q(query | self.make_term_query(searched, fld))
- books = []
- top = self.searcher.search(q,
- self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
- max_results)
- for found in top.scoreDocs:
- books.append(SearchResult(self, found, how_found="search_book"))
-
- return books
+ query = self.index.query(query)
+ query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
+ res = query.execute()
+ return [SearchResult(found, how_found='search_some') for found in res]
- def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
- """
- Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
- some part/fragment of the book.
- """
- qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
+ # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
+ # """
+ # Search for perfect book matches. Just see if the query matches with some author or title,
+ # taking hints into account.
+ # """
+ # fields_to_search = ['authors', 'title']
+ # only_in = None
+ # if hint:
+ # if not hint.should_search_for_book():
+ # return []
+ # fields_to_search = hint.just_search_in(fields_to_search)
+ # only_in = hint.book_filter()
+
+ # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
+
+ # books = []
+ # for q in qrys:
+ # top = self.searcher.search(q,
+ # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
+ # max_results)
+ # for found in top.scoreDocs:
+ # books.append(SearchResult(self, found, how_found="search_perfect_book"))
+ # return books
+
+ # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
+ # fields_to_search = ['tags', 'authors', 'title']
+
+ # only_in = None
+ # if hint:
+ # if not hint.should_search_for_book():
+ # return []
+ # fields_to_search = hint.just_search_in(fields_to_search)
+ # only_in = hint.book_filter()
+
+ # tokens = self.get_tokens(searched, field='SIMPLE')
+
+ # q = BooleanQuery()
+
+ # for fld in fields_to_search:
+ # q.add(BooleanClause(self.make_term_query(tokens, field=fld,
+ # fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+
+ # books = []
+ # top = self.searcher.search(q,
+ # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
+ # max_results)
+ # for found in top.scoreDocs:
+ # books.append(SearchResult(self, found, how_found="search_book"))
+
+ # return books
+
+ # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
+ # """
+ # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
+ # some part/fragment of the book.
+ # """
+ # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
- flt = None
- if hint:
- flt = hint.part_filter()
+ # flt = None
+ # if hint:
+ # flt = hint.part_filter()
- books = []
- for q in qrys:
- top = self.searcher.search(q,
- self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
- flt]),
- max_results)
- for found in top.scoreDocs:
- books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
+ # books = []
+ # for q in qrys:
+ # top = self.searcher.search(q,
+ # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
+ # flt]),
+ # max_results)
+ # for found in top.scoreDocs:
+ # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
- return books
+ # return books
- def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
+ def search_everywhere(self, searched):
"""
Tries to use search terms to match different fields of book (or its parts).
E.g. one word can be an author survey, another be a part of the title, and the rest
are some words from third chapter.
"""
- if tokens_cache is None: tokens_cache = {}
books = []
- only_in = None
-
- if hint:
- only_in = hint.part_filter()
-
# content only query : themes x content
- q = BooleanQuery()
-
- tokens_pl = self.get_tokens(searched, field='text', cached=tokens_cache)
- tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
- # only search in themes when we do not already filter by themes
- if hint is None or hint.just_search_in(['themes']) != []:
- q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
- fuzzy=fuzzy), BooleanClause.Occur.MUST))
+ q = self.make_term_query(searched, 'text')
+ q_themes = self.make_term_query(searched, 'themes_pl')
- q.add(BooleanClause(self.make_term_query(tokens_pl, field='text',
- fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
+ query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
+ res = query.execute()
- topDocs = self.searcher.search(q, only_in, max_results)
- for found in topDocs.scoreDocs:
- books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
+ for found in res:
+ books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))
# query themes/content x author/title/tags
- q = BooleanQuery()
- in_content = BooleanQuery()
- in_meta = BooleanQuery()
+ in_content = self.index.Q()
+ in_meta = self.index.Q()
for fld in ['themes_pl', 'text']:
- in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
+ in_content |= self.make_term_query(searched, field=fld)
for fld in ['tags', 'authors', 'title']:
- in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
-
- q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
- q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
+ in_meta |= self.make_term_query(searched, field=fld)
- topDocs = self.searcher.search(q, only_in, max_results)
- for found in topDocs.scoreDocs:
- books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
+ q = in_content & in_meta
+ res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
+ for found in res:
+ books.append(SearchResult(found, how_found='search_everywhere'))
return books
- # def multisearch(self, query, max_results=50):
- # """
- # Search strategy:
- # - (phrase) OR -> content
- # -> title
- # -> authors
- # - (keywords) -> authors
- # -> motyw
- # -> tags
- # -> content
- # """
- # queryreader = StringReader(query)
- # tokens = self.get_tokens(queryreader)
-
- # top_level = BooleanQuery()
- # Should = BooleanClause.Occur.SHOULD
-
- # phrase_level = BooleanQuery()
- # phrase_level.setBoost(1.3)
-
- # p_content = self.make_phrase(tokens, joined=True)
- # p_title = self.make_phrase(tokens, 'title')
- # p_author = self.make_phrase(tokens, 'author')
-
- # phrase_level.add(BooleanClause(p_content, Should))
- # phrase_level.add(BooleanClause(p_title, Should))
- # phrase_level.add(BooleanClause(p_author, Should))
-
- # kw_level = BooleanQuery()
-
- # kw_level.add(self.make_term_query(tokens, 'author'), Should)
- # j_themes = self.make_term_query(tokens, 'themes', joined=True)
- # kw_level.add(j_themes, Should)
- # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
- # j_con = self.make_term_query(tokens, joined=True)
- # kw_level.add(j_con, Should)
-
- # top_level.add(BooleanClause(phrase_level, Should))
- # top_level.add(BooleanClause(kw_level, Should))
-
- # return None
-
- def get_snippets(self, scoreDoc, query, field='text'):
+ def get_snippets(self, searchresult, query, field='text', num=1):
"""
- Returns a snippet for found scoreDoc.
+ Returns snippets for a given SearchResult.
"""
- htmlFormatter = SimpleHTMLFormatter()
- highlighter = Highlighter(htmlFormatter, QueryScorer(query))
-
- stored = self.searcher.doc(scoreDoc.doc)
-
- position = stored.get('snippets_position')
- length = stored.get('snippets_length')
- if position is None or length is None:
- return None
- revision = stored.get('snippets_revision')
- if revision: revision = int(revision)
-
- # locate content.
- book_id = int(stored.get('book_id'))
+ maxnum = len(searchresult)
+ if num is None or num < 0 or num > maxnum:
+ num = maxnum
+ book_id = searchresult.book_id
+ revision = searchresult.snippet_revision()
snippets = Snippets(book_id, revision=revision)
-
+ snips = [None] * maxnum
try:
snippets.open()
+ idx = 0
+ while idx < maxnum and num > 0:
+ position, length = searchresult.snippet_pos(idx)
+ if position is None or length is None:
+ idx += 1
+ continue
+ text = snippets.get((int(position),
+ int(length)))
+ print "== %s -- %s ==" % (query, text)
+ snip = self.index.highlight(text=text, field=field, q=query)
+ snips[idx] = snip
+ if snip:
+ num -= 1
+ idx += 1
+
except IOError, e:
log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
return []
+ finally:
+ snippets.close()
- try:
- try:
- text = snippets.get((int(position),
- int(length)))
- finally:
- snippets.close()
-
- tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
- # highlighter.getBestTextFragments(tokenStream, text, False, 10)
- snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
+ # remove verse end markers.
+ snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
- except Exception, e:
- e2 = e
- if hasattr(e, 'getJavaException'):
- e2 = unicode(e.getJavaException())
- raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
- e2)
- return snip
+ searchresult.snippets = snips
+ return snips
- @staticmethod
- def enum_to_array(enum):
+ def hint_tags(self, query, pdcounter=True, prefix=True):
"""
- Converts a lucene TermEnum to array of Terms, suitable for
- addition to queries
+ Return auto-complete hints for tags
+ using prefix search.
"""
- terms = []
-
- while True:
- t = enum.term()
- if t:
- terms.append(t)
- if not enum.next(): break
+ q = self.index.Q()
+ query = query.strip()
+ for field in ['tag_name', 'tag_name_pl']:
+ if prefix:
+ q |= self.index.Q(**{field: query + "*"})
+ else:
+ q |= self.make_term_query(query, field=field)
+ qu = self.index.query(q).exclude(tag_category="book")
- if terms:
- return JArray('object')(terms, Term)
+ return self.search_tags(qu, pdcounter=pdcounter)
- def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
+ def search_tags(self, query, filters=None, pdcounter=False):
"""
Search for Tag objects using query.
"""
+ if not filters: filters = []
if not pdcounter:
- filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
- tops = self.searcher.search(query, filt, max_results)
+ filters.append(~self.index.Q(is_pdcounter=True))
+ res = self.apply_filters(query, filters).execute()
tags = []
- for found in tops.scoreDocs:
- doc = self.searcher.doc(found.doc)
- is_pdcounter = doc.get('is_pdcounter')
+ for doc in res:
+ is_pdcounter = doc.get('is_pdcounter', False)
category = doc.get('tag_category')
try:
- if is_pdcounter == 'true':
+ if is_pdcounter:
if category == 'pd_author':
tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
elif category == 'pd_book':
return tags
- def search_books(self, query, filt=None, max_results=10):
+ def hint_books(self, query, prefix=True):
+ """
+ Returns auto-complete hints for book titles,
+ because we do not index 'pseudo' title-tags.
+ Uses prefix search.
+ """
+ q = self.index.Q()
+ query = query.strip()
+ if prefix:
+ q |= self.index.Q(title=query + "*")
+ else:
+ q |= self.make_term_query(query, field='title')
+ qu = self.index.query(q)
+ only_books = self.index.Q(is_book=True)
+ return self.search_books(qu, [only_books])
+
+ def search_books(self, query, filters=None, max_results=10):
"""
Searches for Book objects using query
"""
bks = []
- tops = self.searcher.search(query, filt, max_results)
- for found in tops.scoreDocs:
- doc = self.searcher.doc(found.doc)
+ res = self.apply_filters(query, filters).field_limit(['book_id']).execute()
+ for r in res:
try:
- bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
+ bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
except catalogue.models.Book.DoesNotExist: pass
return bks
+
+ # def make_prefix_phrase(self, toks, field):
+ # q = MultiPhraseQuery()
+ # for i in range(len(toks)):
+ # t = Term(field, toks[i])
+ # if i == len(toks) - 1:
+ # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
+ # if pterms:
+ # q.add(pterms)
+ # else:
+ # q.add(t)
+ # else:
+ # q.add(t)
+ # return q
+
+ # @staticmethod
+ # def term_filter(term, inverse=False):
+ # only_term = TermsFilter()
+ # only_term.addTerm(term)
+
+ # if inverse:
+ # neg = BooleanFilter()
+ # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
+ # only_term = neg
+
+ # return only_term
- def make_prefix_phrase(self, toks, field):
- q = MultiPhraseQuery()
- for i in range(len(toks)):
- t = Term(field, toks[i])
- if i == len(toks) - 1:
- pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
- if pterms:
- q.add(pterms)
- else:
- q.add(t)
- else:
- q.add(t)
- return q
-
- @staticmethod
- def term_filter(term, inverse=False):
- only_term = TermsFilter()
- only_term.addTerm(term)
-
- if inverse:
- neg = BooleanFilter()
- neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
- only_term = neg
- return only_term
-
- def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
- """
- Return auto-complete hints for tags
- using prefix search.
- """
- toks = self.get_tokens(string, field='SIMPLE')
- top = BooleanQuery()
-
- for field in ['tag_name', 'tag_name_pl']:
- if prefix:
- q = self.make_prefix_phrase(toks, field)
- else:
- q = self.make_term_query(toks, field, fuzzy=fuzzy)
- top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
-
- no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
-
- return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
-
- def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
- """
- Returns auto-complete hints for book titles
- Because we do not index 'pseudo' title-tags.
- Prefix search.
- """
- toks = self.get_tokens(string, field='SIMPLE')
-
- if prefix:
- q = self.make_prefix_phrase(toks, 'title')
- else:
- q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
-
- return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
@staticmethod
- def chain_filters(filters, op='XXXChainedFilter.AND'):
+ def apply_filters(query, filters):
"""
- Chains a filter list together
+ Apply filters to a query
"""
+ if filters is None: filters = []
filters = filter(lambda x: x is not None, filters)
- if not filters or filters is []:
- return None
- chf = ChainedFilter(JArray('object')(filters, Filter), op)
- return chf
+ for f in filters:
+ query = query.query(f)
+ return query
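+ # e.g. apply_filters(self.index.query(text=u'kot'), [self.index.Q(is_book=True)])
+ # is equivalent to self.index.query(text=u'kot').query(is_book=True).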
- def filtered_categories(self, tags):
- """
- Return a list of tag categories, present in tags list.
- """
- cats = {}
- for t in tags:
- cats[t.category] = True
- return cats.keys()
+ # def filtered_categories(self, tags):
+ # """
+ # Return a list of tag categories, present in tags list.
+ # """
+ # cats = {}
+ # for t in tags:
+ # cats[t.category] = True
+ # return cats.keys()
- def hint(self):
- return Hint(self)
+ # def hint(self):
+ # return Hint(self)
# We don't need hits which lead to sections but do not have
# snippets.
- hits = filter(lambda h: 'fragment' in h or
- h['snippets'], result.hits)[0:5]
-
- for hit in hits:
- hit['snippets'] = map(lambda s: s.replace("\n", "<br />").replace('---', '—'), hit['snippets'])
+ hits = filter(lambda (idx, h):
+ result.snippets[idx] is not None
+ or 'fragment' in h, enumerate(result.hits))
+ print "[tmpl: from %d hits selected %d]" % (len(result.hits), len(hits))
+
+ for (idx, hit) in hits:
+ # currently we generate one snippet per hit though.
+ if 'fragment' in hit:
+ continue
+ snip = result.snippets[idx]
+ # fix some formatting
+ snip = snip.replace("\n", "<br />").replace('---', '—')
+ hit['snippet'] = snip
return {
'related': book.related_info(),
'book': book,
'main_link': book.get_absolute_url(),
'request': context.get('request'),
- 'hits': hits,
+ 'hits': [hit for (idx, hit) in hits],
}
-
from catalogue.models import Book, Tag, Fragment
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from catalogue.views import JSONResponse
-from search import Search, JVM, SearchResult
+from search import Search, SearchResult
from lucene import StringReader
from suggest.forms import PublishingSuggestForm
from time import sleep
#import enchant
import json
-#dictionary = enchant.Dict('en_US')
-
def match_word_re(word):
if 'sqlite' in settings.DATABASES['default']['ENGINE']:
def did_you_mean(query, tokens):
- change = {}
- for t in tokens:
- authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
- if len(authors) > 0:
- continue
-
- if False:
- if not dictionary.check(t):
- try:
- change_to = dictionary.suggest(t)[0].lower()
- if change_to != t.lower():
- change[t] = change_to
- except IndexError:
- pass
-
- if change == {}:
- return None
-
- for frm, to in change.items():
- query = query.replace(frm, to)
-
return query
+ # change = {}
+ # for t in tokens:
+ # authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
+ # if len(authors) > 0:
+ # continue
+ # if False:
+ # if not dictionary.check(t):
+ # try:
+ # change_to = dictionary.suggest(t)[0].lower()
+ # if change_to != t.lower():
+ # change[t] = change_to
+ # except IndexError:
+ # pass
-JVM.attachCurrentThread()
-_search = None
-
+ # if change == {}:
+ # return None
-def get_search():
- global _search
+ # for frm, to in change.items():
+ # query = query.replace(frm, to)
- while _search is False:
- sleep(1)
-
- if _search is None:
- _search = False
- _search = Search()
- return _search
+ # return query
def hint(request):
prefix = request.GET.get('term', '')
if len(prefix) < 2:
return JSONResponse([])
- JVM.attachCurrentThread()
-
- search = get_search()
- hint = search.hint()
- try:
- tags = request.GET.get('tags', '')
- hint.tags(Tag.get_tag_list(tags))
- except:
- pass
+ search = Search()
# tags will limit the results here
# but tags can be on the book or on its fragments
# if the tags apply only to the book, the new ones must be within the same book
tags = search.hint_tags(prefix, pdcounter=True)
books = search.hint_books(prefix)
-
def is_dupe(tag):
if isinstance(tag, PDCounterAuthor):
if filter(lambda t: t.slug == tag.slug and t != tag, tags):
content_type="application/json; charset=utf-8")
else:
return JSONResponse(data)
-
def main(request):
results = {}
- JVM.attachCurrentThread() # where to put this?
results = None
query = None
- fuzzy = False #0.8
- query = request.GET.get('q','')
- # book_id = request.GET.get('book', None)
- # book = None
- # if book_id is not None:
- # book = get_object_or_404(Book, id=book_id)
-
- # hint = search.hint()
- # try:
- # tag_list = Tag.get_tag_list(tags)
- # except:
- # tag_list = []
+ query = request.GET.get('q', '')
if len(query) < 2:
- return render_to_response('catalogue/search_too_short.html', {'prefix': query},
- context_instance=RequestContext(request))
-
- search = get_search()
- # hint.tags(tag_list)
- # if book:
- # hint.books(book)
- tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy)
- tags = split_tags(tags)
+ return render_to_response('catalogue/search_too_short.html',
+ {'prefix': query},
+ context_instance=RequestContext(request))
+ search = Search()
- toks = StringReader(query)
- tokens_cache = {}
+ # change hints
+ tags = search.hint_tags(query, pdcounter=True, prefix=False)
+ tags = split_tags(tags)
- author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache)
- title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache)
+ author_results = search.search_phrase(query, 'authors', book=True)
+ title_results = search.search_phrase(query, 'title', book=True)
# Boost main author/title results with mixed search, and save some of its results for end of list.
# boost author, title results
- author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache)
+ author_title_mixed = search.search_some(query, ['authors', 'title', 'tags'])
author_title_rest = []
+
for b in author_title_mixed:
- bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
- for b2 in bks:
+ also_in_mixed = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results)
+ for b2 in also_in_mixed:
b2.boost *= 1.1
- if bks is []:
+ if not also_in_mixed:
author_title_rest.append(b)
# Do a phrase search but a term search as well - this can give us better snippets than search_everywhere,
# because the query uses only one field.
text_phrase = SearchResult.aggregate(
- search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4),
- search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False))
+ search.search_phrase(query, 'text', snippets=True, book=False),
+ search.search_some(query, ['text'], snippets=True, book=False))
- everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+ everywhere = search.search_everywhere(query)
def already_found(results):
def f(e):
everywhere = SearchResult.aggregate(everywhere, author_title_rest)
- for res in [author_results, title_results, text_phrase, everywhere]:
+ for field, res in [('authors', author_results),
+ ('title', title_results),
+ ('text', text_phrase),
+ ('text', everywhere)]:
res.sort(reverse=True)
+ print "get snips %s, res size %d" % (field, len(res))
for r in res:
- for h in r.hits:
- h['snippets'] = map(lambda s:
- re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
- re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
+ print "Get snippets for %s" % r
+ search.get_snippets(r, query, field, 3)
+ # for r in res:
+ # for h in r.hits:
+ # h['snippets'] = map(lambda s:
+ # re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"",
+ # re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
- suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
+ # suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE"))
+ suggestion = u''
def ensure_exists(r):
try:
return render_to_response('catalogue/search_multiple_hits.html',
{'tags': tags,
'prefix': query,
- 'results': { 'author': author_results,
- 'title': title_results,
- 'content': text_phrase,
- 'other': everywhere},
+ 'results': {'author': author_results,
+ 'title': title_results,
+ 'content': text_phrase,
+ 'other': everywhere},
'did_you_mean': suggestion},
context_instance=RequestContext(request))
--- /dev/null
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+ This is the Solr schema file. This file should be named "schema.xml" and
+ should be in the conf directory under the solr home
+ (i.e. ./solr/conf/schema.xml by default)
+ or located where the classloader for the Solr webapp can find it.
+
+ This example schema is the recommended starting point for users.
+ It should be kept correct and concise, usable out-of-the-box.
+
+ For more information, on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+
+ PERFORMANCE NOTE: this schema includes many optional features and should not
+ be used for benchmarking. To improve performance one could
+ - set stored="false" for all fields possible (esp large fields) when you
+ only need to search on the field but don't need to return the original
+ value.
+ - set indexed="false" if you don't need to search on the field, but only
+ return the field as a result of searching on other indexed fields.
+ - remove all unneeded copyField statements
+ - for best index size and searching performance, set "index" to false
+ for all general text fields, use copyField to copy them to the
+ catchall "text" field, and use that for searching.
+ - For maximum indexing performance, use the StreamingUpdateSolrServer
+ java client.
+ - Remember to run the JVM in server mode, and use a higher logging level
+ that avoids logging every request
+-->
+
+<schema name="example" version="1.5">
+ <!-- attribute "name" is the name of this schema and is only used for display purposes.
+ version="x.y" is Solr's version number for the schema syntax and semantics. It should
+ not normally be changed by applications.
+ 1.0: multiValued attribute did not exist, all fields are multiValued by nature
+ 1.1: multiValued attribute introduced, false by default
+ 1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
+ 1.3: removed optional field compress feature
+ 1.4: default auto-phrase (QueryParser feature) to off
+ 1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...)
+ -->
+
+ <fields>
+ <!-- Valid attributes for fields:
+ name: mandatory - the name for the field
+ type: mandatory - the name of a field type from the
+ <types> fieldType section
+ indexed: true if this field should be indexed (searchable or sortable)
+ stored: true if this field should be retrievable
+ multiValued: true if this field may contain multiple values per document
+ omitNorms: (expert) set to true to omit the norms associated with
+ this field (this disables length normalization and index-time
+ boosting for the field, and saves some memory). Only full-text
+ fields or fields that need an index-time boost need norms.
+ Norms are omitted for primitive (non-analyzed) types by default.
+ termVectors: [false] set to true to store the term vector for a
+ given field.
+ When using MoreLikeThis, fields used for similarity should be
+ stored for best performance.
+ termPositions: Store position information with the term vector.
+ This will increase storage costs.
+ termOffsets: Store offset information with the term vector. This
+ will increase storage costs.
+ required: The field is required. It will throw an error if the
+ value does not exist
+ default: a value that should be used if no value is specified
+ when adding a document.
+ -->
+
+ <!-- field names should consist of alphanumeric or underscore characters only and
+ not start with a digit. This is not currently strictly enforced,
+ but other field names will not have first class support from all components
+ and back compatibility is not guaranteed. Names with both leading and
+ trailing underscores (e.g. _version_) are reserved.
+ -->
+
+ <!-- <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> -->
+ <!-- <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/> -->
+ <!-- <field name="name" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/> -->
+ <!-- <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/> -->
+ <!-- <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/> -->
+ <!-- <field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> -->
+
+ <!-- <field name="weight" type="float" indexed="true" stored="true"/> -->
+ <!-- <field name="price" type="float" indexed="true" stored="true"/> -->
+ <!-- <field name="popularity" type="int" indexed="true" stored="true" /> -->
+ <!-- <field name="inStock" type="boolean" indexed="true" stored="true" /> -->
+
+ <!-- <field name="store" type="location" indexed="true" stored="true"/> -->
+
+ <!-- Common metadata fields, named specifically to match up with
+ SolrCell metadata when parsing rich documents such as Word, PDF.
+ Some fields are multiValued only because Tika currently may return
+ multiple values for them. Some metadata is parsed from the documents,
+ but there are some which come from the client context:
+ "content_type": From the HTTP headers of incoming stream
+ "resourcename": From SolrCell request param resource.name
+ -->
+ <!-- <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/> -->
+ <!-- <field name="subject" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="description" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="comments" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="author" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="keywords" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="category" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="resourcename" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="url" type="text_general" indexed="true" stored="true"/> -->
+ <!-- <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> -->
+ <!-- <field name="last_modified" type="date" indexed="true" stored="true"/> -->
+ <!-- <field name="links" type="string" indexed="true" stored="true" multiValued="true"/> -->
+
+ <field name="book_id" type="int" indexed="true" stored="true" />
+ <field name="parent_id" type="int" indexed="false" stored="true" />
+ <field name="slug" type="text_general" stored="false" indexed="true" omitNorms="true"/> <!-- no norms -->
+ <field name="tags" type="lowercase" stored="false" indexed="true" multiValued="true"/>
+ <field name="is_book" type="boolean" stored="false" indexed="true"/>
+ <field name="authors" type="text_general" stored="false" indexed="true" multiValued="true"/>
+ <field name="translators" type="text_general" stored="false" indexed="true" multiValued="true"/>
+ <field name="title" type="text_pl" stored="false" indexed="true"/>
+ <field name="title_orig" type="text_general" stored="false" indexed="true"/>
+<!-- <field name="published_date" type="tdate" stored="false" indexed="true"/>-->
+ <field name="published_date" type="string" stored="true" indexed="true"/>
+
+ <field name="themes" type="lowercase" stored="true" intexed="true" termVectors="true" termPositions="true" multiValued="true" />
+ <field name="themes_pl" type="text_pl" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" />
+ <field name="header_index" type="int" stored="true" indexed="true"/>
+ <field name="header_span" type="int" stored="true" indexed="true"/>
+ <field name="header_type" type="lowercase" stored="true" indexed="false"/>
+ <field name="text" type="text_pl" stored="false" indexed="true" termPositions="true" />
+
+ <field name="snippets_position" type="int" stored="true" indexed="false"/>
+ <field name="snippets_length" type="int" stored="true" indexed="false"/>
+ <field name="snippets_revision" type="int" stored="true" indexed="false"/>
+ <field name="fragment_anchor" type="string" stored="true" indexed="false"/>
+
+ <field name="tag_id" type="int" stored="true" indexed="true"/>
+ <field name="tag_name" type="lowercase" stored="true" intexed="true" />
+ <field name="tag_name_pl" type="text_pl" stored="false" indexed="true" multiValued="true"/>
+ <field name="tag_category" type="string" stored="true" indexed="true" />
+ <field name="is_pdcounter" type="boolean" stored="true" indexed="true" />
+
+ <!-- Main body of document extracted by SolrCell.
+ NOTE: This field is not indexed by default, since it is also copied to "text"
+ using copyField below. This is to save space. Use this field for returning and
+ highlighting document content. Use the "text" field to search the content. -->
+ <!-- <field name="content" type="text_general" indexed="false" stored="true" multiValued="true"/> -->
+
+
+ <!-- catchall field, containing all other searchable text fields (implemented
+ via copyField further on in this schema -->
+ <!-- <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/> -->
+
+ <!-- catchall text field that indexes tokens both normally and in reverse for efficient
+ leading wildcard queries. -->
+ <!-- <field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/> -->
+
+ <!-- non-tokenized version of manufacturer to make it easier to sort or group
+ results by manufacturer. copied from "manu" via copyField -->
+ <!-- <field name="manu_exact" type="string" indexed="true" stored="false"/> -->
+
+ <!-- <field name="payloads" type="payloads" indexed="true" stored="true"/> -->
+
+ <!-- <field name="_version_" type="long" indexed="true" stored="true"/> -->
+
+ <!-- Uncommenting the following will create a "timestamp" field using
+ a default value of "NOW" to indicate when each document was indexed.
+ -->
+ <!--
+ <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+ -->
+
+ <!-- Dynamic field definitions allow using convention over configuration
+ for fields via the specification of patterns to match field names.
+ EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
+ RESTRICTION: the glob-like pattern in the name attribute must have
+ a "*" only at the start or the end. -->
+
+ <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
+ <dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_s" type="string" indexed="true" stored="true" />
+ <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
+ <dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
+ <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
+ <dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
+ <dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
+ <dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true"/>
+
+ <!-- Type used to index the lat and lon components for the "location" FieldType -->
+ <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
+
+ <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+ <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/>
+ <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
+
+ <!-- some trie-coded dynamic fields for faster range queries -->
+ <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
+ <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
+ <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
+ <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
+ <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/>
+
+ <dynamicField name="*_pi" type="pint" indexed="true" stored="true"/>
+<!-- <dynamicField name="*_c" type="currency" indexed="true" stored="true"/>-->
+
+ <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
+ <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
+
+ <dynamicField name="random_*" type="random" />
+
+ <!-- uncomment the following to ignore any fields that don't already match an existing
+ field name or dynamic field, rather than reporting them as an error.
+ alternately, change the type="ignored" to some other type e.g. "text" if you want
+ unknown fields indexed and/or stored by default -->
+ <!--dynamicField name="*" type="ignored" multiValued="true" /-->
+ <field name="uid" type="string" indexed="true" stored="true"/>
+ </fields>
+
+
+ <!-- Field to use to determine and enforce document uniqueness.
+ Unless this field is marked with required="false", it will be a required field
+ -->
+ <uniqueKey>uid</uniqueKey>
+
+ <!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when
+ parsing a query string that isn't explicit about the field. Machine (non-user)
+ generated queries are best made explicit, or they can use the "df" request parameter
+ which takes precedence over this.
+ Note: Un-commenting defaultSearchField will be insufficient if your request handler
+ in solrconfig.xml defines "df", which takes precedence. That would need to be removed.
+ <defaultSearchField>text</defaultSearchField> -->
+
+ <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers
+ when parsing a query string to determine if a clause of the query should be marked as
+ required or optional, assuming the clause isn't already marked by some operator.
+ The default is OR, which is generally assumed so it is not a good idea to change it
+ globally here. The "q.op" request parameter takes precedence over this.
+ <solrQueryParser defaultOperator="OR"/> -->
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field differently,
+ or to add multiple fields to the same field for easier/faster searching. -->
+
+ <copyField source="themes" dest="themes_pl"/>
+ <copyField source="tag_name" dest="tag_name_pl"/>
+
+<!--
+ <copyField source="cat" dest="text"/>
+ <copyField source="name" dest="text"/>
+ <copyField source="manu" dest="text"/>
+ <copyField source="features" dest="text"/>
+ <copyField source="includes" dest="text"/>
+ <copyField source="manu" dest="manu_exact"/>
+-->
+ <!-- Copy the price into a currency enabled field (default USD) -->
+<!-- <copyField source="price" dest="price_c"/>-->
+
+ <!-- Text fields from SolrCell to search by default in our catch-all field -->
+<!-- <copyField source="title" dest="text"/>
+ <copyField source="author" dest="text"/>
+ <copyField source="description" dest="text"/>
+ <copyField source="keywords" dest="text"/>
+ <copyField source="content" dest="text"/>
+ <copyField source="content_type" dest="text"/>
+ <copyField source="resourcename" dest="text"/>
+ <copyField source="url" dest="text"/>-->
+
+ <!-- Create a string version of author for faceting -->
+<!-- <copyField source="author" dest="author_s"/>-->
+
+ <!-- Above, multiple source fields are copied to the [text] field.
+ Another way to map multiple source fields to the same
+ destination field is to use the dynamic field syntax.
+ copyField also supports a maxChars to copy setting. -->
+
+ <!-- <copyField source="*_t" dest="text" maxChars="3000"/> -->
+
+ <!-- copy name to alphaNameSort, a field designed for sorting by name -->
+ <!-- <copyField source="name" dest="alphaNameSort"/> -->
+
+ <types>
+ <!-- field type definitions. The "name" attribute is
+ just a label to be used by field definitions. The "class"
+ attribute and any other attributes determine the real
+ behavior of the fieldType.
+ Class names starting with "solr" refer to java classes in a
+ standard package such as org.apache.solr.analysis
+ -->
+
+ <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
+
+ <!-- boolean type: "true" or "false" -->
+ <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+
+    <!-- The optional sortMissingLast and sortMissingFirst attributes
+      are currently supported on types that are sorted internally as strings
+      and on numeric types.
+ This includes "string","boolean", and, as of 3.5 (and 4.x),
+ int, float, long, date, double, including the "Trie" variants.
+ - If sortMissingLast="true", then a sort on this field will cause documents
+ without the field to come after documents with the field,
+ regardless of the requested sort order (asc or desc).
+ - If sortMissingFirst="true", then a sort on this field will cause documents
+ without the field to come before documents with the field,
+ regardless of the requested sort order.
+ - If sortMissingLast="false" and sortMissingFirst="false" (the default),
+ then default lucene sorting will be used which places docs without the
+ field first in an ascending sort and last in a descending sort.
+ -->
+
+ <!--
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+ -->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
+
+ <!--
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for NumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
+ -->
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>
+
+ <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+ is a more restricted form of the canonical representation of dateTime
+ http://www.w3.org/TR/xmlschema-2/#dateTime
+ The trailing "Z" designates UTC time and is mandatory.
+ Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+ All other components are mandatory.
+
+ Expressions can also be used to denote calculations that should be
+ performed relative to "NOW" to determine the value, ie...
+
+ NOW/HOUR
+ ... Round to the start of the current hour
+ NOW-1DAY
+ ... Exactly 1 day prior to now
+ NOW/DAY+6MONTHS+3DAYS
+ ... 6 months and 3 days in the future from the start of
+ the current day
+
+ Consult the DateField javadocs for more information.
+
+ Note: For faster range queries, consider the tdate type
+ -->
+ <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
+
+ <!-- A Trie based date field for faster date range queries and date faceting. -->
+ <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
+
+
+    <!-- Binary data type. The data should be sent/retrieved as Base64 encoded Strings -->
+ <fieldtype name="binary" class="solr.BinaryField"/>
+
+ <!--
+ Note:
+ These should only be used for compatibility with existing indexes (created with lucene or older Solr versions).
+ Use Trie based fields instead. As of Solr 3.5 and 4.x, Trie based fields support sortMissingFirst/Last
+
+ Plain numeric field types that store and index the text
+ value verbatim (and hence don't correctly support range queries, since the
+ lexicographic ordering isn't equal to the numeric ordering)
+ -->
+ <fieldType name="pint" class="solr.IntField"/>
+ <fieldType name="plong" class="solr.LongField"/>
+ <fieldType name="pfloat" class="solr.FloatField"/>
+ <fieldType name="pdouble" class="solr.DoubleField"/>
+ <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/>
+
+ <!-- The "RandomSortField" is not used to store or search any
+ data. You can declare fields of this type it in your schema
+ to generate pseudo-random orderings of your docs for sorting
+ or function purposes. The ordering is generated based on the field
+ name and the version of the index. As long as the index version
+ remains unchanged, and the same field name is reused,
+ the ordering of the docs will be consistent.
+       If you want different pseudo-random orderings of documents,
+ for the same version of the index, use a dynamicField and
+ change the field name in the request.
+ -->
+ <fieldType name="random" class="solr.RandomSortField" indexed="true" />
+
+ <!-- solr.TextField allows the specification of custom text analyzers
+ specified as a tokenizer and a list of token filters. Different
+ analyzers may be specified for indexing and querying.
+
+ The optional positionIncrementGap puts space between multiple fields of
+ this type on the same document, with the purpose of preventing false phrase
+ matching across fields.
+
+ For more info on customizing your analyzer chain, please see
+ http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+ -->
+
+ <!-- One can also specify an existing Analyzer class that has a
+ default constructor via the class attribute on the analyzer element.
+ Example:
+ <fieldType name="text_greek" class="solr.TextField">
+ <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
+ </fieldType>
+ -->
+
+ <fieldType name="uuid" class="solr.UUIDField" indexed="true" />
+
+
+ <!-- A text field that only splits on whitespace for exact matching of words -->
+ <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A general text field that has reasonable, generic
+ cross-language defaults: it tokenizes with StandardTokenizer,
+ removes stop words from case-insensitive "stopwords.txt"
+ (empty by default), and down cases. At query time only, it
+ also applies synonyms. -->
+ <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
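+
+    <!-- Illustration only: at index time this chain turns a value such as
+         "The Quick Foxes" into the tokens [the] [quick] [foxes]; "the" would
+         be dropped only if listed in stopwords.txt, which is empty by
+         default. -->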
+
+ <!-- A text field with defaults appropriate for English: it
+ tokenizes with StandardTokenizer, removes English stop words
+ (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and
+ finally applies Porter's stemming. The query time analyzer
+ also applies synonyms from synonyms.txt. -->
+ <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="lang/stopwords_en.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="lang/stopwords_en.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishPossessiveFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ -->
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and autophrase features enabled.
+ This field is just like text_en, except it adds
+ WordDelimiterFilter to enable splitting and matching of
+ words on case-change, alpha numeric boundaries, and
+ non-alphanumeric chars. This means certain compound word
+ cases will work, for example query "wi fi" will match
+ document "WiFi" or "wi-fi".
+ -->
+ <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="lang/stopwords_en.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="lang/stopwords_en.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+    <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
+ but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
+ <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+          possible with WordDelimiterFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Just like text_general except it reverses the characters of
+ each token, to enable more efficient leading wildcard queries. -->
+ <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+ maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- charFilter + WhitespaceTokenizer -->
+ <!--
+ <fieldType name="text_char_norm" class="solr.TextField" positionIncrementGap="100" >
+ <analyzer>
+ <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ -->
+
+ <!-- This is an example of using the KeywordTokenizer along
+         with various TokenFilterFactories to produce a sortable field
+ that does not include some properties of the source text
+ -->
+ <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
+ <analyzer>
+ <!-- KeywordTokenizer does no actual tokenizing, so the entire
+ input string is preserved as a single token
+ -->
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <!-- The LowerCase TokenFilter does what you expect, which can be
+             useful when you want your sorting to be case insensitive
+ -->
+ <filter class="solr.LowerCaseFilterFactory" />
+ <!-- The TrimFilter removes any leading or trailing whitespace -->
+ <filter class="solr.TrimFilterFactory" />
+ <!-- The PatternReplaceFilter gives you the flexibility to use
+ Java Regular expression to replace any sequence of characters
+ matching a pattern with an arbitrary replacement string,
+ which may include back references to portions of the original
+ string matched by the pattern.
+
+ See the Java Regular Expression documentation for more
+ information on pattern and replacement string syntax.
+
+ http://java.sun.com/j2se/1.6.0/docs/api/java/util/regex/package-summary.html
+ -->
+ <filter class="solr.PatternReplaceFilterFactory"
+ pattern="([^a-z])" replacement="" replace="all"
+ />
+ </analyzer>
+ </fieldType>
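+
+    <!-- Illustration only: a value such as " Pan Tadeusz 99 " leaves this
+         chain as the single sort key "pantadeusz": one keyword token,
+         lowercased, trimmed, with every character outside a-z removed. -->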
+
+ <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!--
+ The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
+ a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
+ Attributes of the DelimitedPayloadTokenFilterFactory :
+ "delimiter" - a one character delimiter. Default is | (pipe)
+ "encoder" - how to encode the following value into a playload
+ float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+ integer -> o.a.l.a.p.IntegerEncoder
+ identity -> o.a.l.a.p.IdentityEncoder
+            Alternatively, a fully qualified class name implementing PayloadEncoder; the encoder must have a no-arg constructor.
+ -->
+ <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- lowercases the entire field value, keeping it as a single token. -->
+ <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory" />
+ </analyzer>
+ </fieldType>
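+
+    <!-- Illustration only: "Barok" and "barok" both index as the single token
+         "barok", which is why the exact-match fields above ("tags", "themes",
+         "tag_name", "header_type") use this type. -->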
+
+ <!--
+ Example of using PathHierarchyTokenizerFactory at index time, so
+ queries for paths match documents at that path, or in descendent paths
+ -->
+ <fieldType name="descendent_path" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.KeywordTokenizerFactory" />
+ </analyzer>
+ </fieldType>
+ <!--
+ Example of using PathHierarchyTokenizerFactory at query time, so
+ queries for paths match documents at that path, or in ancestor paths
+ -->
+ <fieldType name="ancestor_path" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.KeywordTokenizerFactory" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
+ </analyzer>
+ </fieldType>
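+
+    <!-- Illustration only: with descendent_path, indexing "/a/b/c" produces
+         the tokens /a, /a/b and /a/b/c, so a query for the exact path "/a/b"
+         matches that document and everything below it; ancestor_path tokenizes
+         at query time instead, giving the inverse behaviour. -->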
+
+ <!-- since fields of this type are by default not stored or indexed,
+ any data added to them will be ignored outright. -->
+ <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+ <!-- This point type indexes the coordinates as separate fields (subFields)
+ If subFieldType is defined, it references a type, and a dynamic field
+ definition is created matching *___<typename>. Alternately, if
+ subFieldSuffix is defined, that is used to create the subFields.
+ Example: if subFieldType="double", then the coordinates would be
+ indexed in fields myloc_0___double,myloc_1___double.
+ Example: if subFieldSuffix="_d" then the coordinates would be indexed
+ in fields myloc_0_d,myloc_1_d
+ The subFields are an implementation detail of the fieldType, and end
+ users normally should not need to know about them.
+ -->
+ <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
+
+ <!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
+ <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
+
+ <!--
+ A Geohash is a compact representation of a latitude longitude pair in a single field.
+ See http://wiki.apache.org/solr/SpatialSearch
+ -->
+ <fieldtype name="geohash" class="solr.GeoHashField"/>
+
+ <!-- Money/currency field type. See http://wiki.apache.org/solr/MoneyFieldType
+ Parameters:
+ defaultCurrency: Specifies the default currency if none specified. Defaults to "USD"
+ precisionStep: Specifies the precisionStep for the TrieLong field used for the amount
+ providerClass: Lets you plug in other exchange provider backend:
+ solr.FileExchangeRateProvider is the default and takes one parameter:
+          currencyConfig: name of an xml file holding exchange rates
+ solr.OpenExchangeRatesOrgProvider uses rates from openexchangerates.org:
+ ratesFileLocation: URL or path to rates JSON file (default latest.json on the web)
+ refreshInterval: Number of minutes between each rates fetch (default: 1440, min: 60)
+ -->
+<!-- <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" /> - nie dziala -->
+
+
+
+ <!-- some examples for different languages (generally ordered by ISO code) -->
+
+ <!-- Arabic -->
+ <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- for any non-arabic -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/>
+ <!-- normalizes ﻯ to ﻱ, etc -->
+ <filter class="solr.ArabicNormalizationFilterFactory"/>
+ <filter class="solr.ArabicStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Bulgarian -->
+ <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/>
+ <filter class="solr.BulgarianStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Catalan -->
+ <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- removes l', etc -->
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) -->
+ <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- normalize width before bigram, as e.g. half-width dakuten combine -->
+ <filter class="solr.CJKWidthFilterFactory"/>
+ <!-- for any non-CJK -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.CJKBigramFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Czech -->
+ <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/>
+ <filter class="solr.CzechStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Danish -->
+ <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- German -->
+ <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.GermanNormalizationFilterFactory"/>
+ <filter class="solr.GermanLightStemFilterFactory"/>
+ <!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> -->
+ <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="German2"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Greek -->
+ <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- greek specific lowercase for sigma -->
+ <filter class="solr.GreekLowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
+ <filter class="solr.GreekStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Spanish -->
+ <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SpanishLightStemFilterFactory"/>
+ <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Basque -->
+ <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Persian -->
+ <fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <!-- for ZWNJ -->
+ <charFilter class="solr.PersianCharFilterFactory"/>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ArabicNormalizationFilterFactory"/>
+ <filter class="solr.PersianNormalizationFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Finnish -->
+ <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
+ <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- French -->
+ <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- removes l', etc -->
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.FrenchLightStemFilterFactory"/>
+ <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
+ <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Irish -->
+ <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- removes d', etc -->
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
+ <!-- removes n-, etc. position increments is intentionally false! -->
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/>
+ <filter class="solr.IrishLowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Galician -->
+ <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/>
+ <filter class="solr.GalicianStemFilterFactory"/>
+ <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Hindi -->
+ <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <!-- normalizes unicode representation -->
+ <filter class="solr.IndicNormalizationFilterFactory"/>
+ <!-- normalizes variation in spelling -->
+ <filter class="solr.HindiNormalizationFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/>
+ <filter class="solr.HindiStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Hungarian -->
+ <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
+ <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Armenian -->
+ <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Indonesian -->
+ <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/>
+ <!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->
+ <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Italian -->
+ <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- removes l', etc -->
+ <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.ItalianLightStemFilterFactory"/>
+ <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)
+
+ NOTE: If you want to optimize search for precision, use default operator AND in your query
+      parser config with <solrQueryParser defaultOperator="AND"/> (see the deprecated
+      defaultOperator note earlier in this file). Use OR if you would like to optimize
+      for recall (default).
+ -->
+ <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+ <analyzer>
+ <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)
+
+ Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic
+        is used to segment compounds into their parts, and the compound itself is kept as a synonym.
+
+ Valid values for attribute mode are:
+ normal: regular segmentation
+ search: segmentation useful for search with synonyms compounds (default)
+ extended: same as search mode, but unigrams unknown words (experimental)
+
+ For some applications it might be good to use search mode for indexing and normal mode for
+ queries to reduce recall and prevent parts of compounds from being matched and highlighted.
+ Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.
+
+ Kuromoji also has a convenient user dictionary feature that allows overriding the statistical
+ model with your own entries for segmentation, part-of-speech tags and readings without a need
+ to specify weights. Notice that user dictionaries have not been subject to extensive testing.
+
+ User dictionary attributes are:
+ userDictionary: user dictionary filename
+ userDictionaryEncoding: user dictionary encoding (default is UTF-8)
+
+ See lang/userdict_ja.txt for a sample user dictionary file.
+
+ Punctuation characters are discarded by default. Use discardPunctuation="false" to keep them.
+
+ See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
+ -->
+ <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
+ <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
+ <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+ <filter class="solr.JapaneseBaseFormFilterFactory"/>
+ <!-- Removes tokens with certain part-of-speech tags -->
+ <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+ <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
+ <filter class="solr.CJKWidthFilterFactory"/>
+ <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+ <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
+ <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
+ <!-- Lower-cases romaji characters -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Latvian -->
+ <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/>
+ <filter class="solr.LatvianStemFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Dutch -->
+ <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Norwegian -->
+ <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
+ <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
+ <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Polish -->
+ <fieldType name="text_pl" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+ </analyzer>
+ </fieldType>
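+
+  <!-- Illustration only: Morfologik is a dictionary-based Polish lemmatizer,
+       so an inflected form such as "książkami" should reduce to its lemma
+       "książka" at both index and query time; the exact output depends on the
+       dictionary shipped with the Morfologik version in use. -->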
+
+
+ <!-- Portuguese -->
+ <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.PortugueseLightStemFilterFactory"/>
+ <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
+ <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
+ <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Romanian -->
+ <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Russian -->
+ <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
+ <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Swedish -->
+ <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
+ <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
+ </analyzer>
+ </fieldType>
+
+ <!-- Thai -->
+ <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.ThaiWordFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Turkish -->
+ <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.TurkishLowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
+ </analyzer>
+ </fieldType>
+
+ </types>
+
+ <!-- Similarity is the scoring routine for each document vs. a query.
+ A custom Similarity or SimilarityFactory may be specified here, but
+ the default is fine for most applications.
+ For more info: http://wiki.apache.org/solr/SchemaXml#Similarity
+ -->
+ <!--
+ <similarity class="com.example.solr.CustomSimilarityFactory">
+ <str name="paramkey">param value</str>
+ </similarity>
+ -->
+
+</schema>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+ For more details about configurations options that may appear in
+ this file, see http://wiki.apache.org/solr/SolrConfigXml.
+-->
+<config>
+ <!-- In all configuration below, a prefix of "solr." for class names
+ is an alias that causes solr to search appropriate packages,
+ including org.apache.solr.(search|update|request|core|analysis)
+
+ You may also specify a fully qualified Java classname if you
+ have your own custom plugins.
+ -->
+
+ <!-- Controls what version of Lucene various components of Solr
+ adhere to. Generally, you want to use the latest version to
+ get all bug fixes and improvements. It is highly recommended
+ that you fully re-index after changing this setting as it can
+ affect both how text is indexed and queried.
+ -->
+ <luceneMatchVersion>LUCENE_40</luceneMatchVersion>
+
+  <!-- lib directives can be used to instruct Solr to load any Jars
+ identified and use them to resolve any "plugins" specified in
+ your solrconfig.xml or schema.xml (ie: Analyzers, Request
+ Handlers, etc...).
+
+ All directories and paths are resolved relative to the
+ instanceDir.
+
+ If a "./lib" directory exists in your instanceDir, all files
+ found in it are included as if you had used the following
+ syntax...
+
+ <lib dir="./lib" />
+ -->
+
+ <!-- A 'dir' option by itself adds any files found in the directory
+       to the classpath; this is useful for including all jars in a
+ directory.
+ -->
+ <!--
+ <lib dir="../add-everything-found-in-this-dir-to-the-classpath" />
+ -->
+
+ <!-- When a 'regex' is specified in addition to a 'dir', only the
+ files in that directory which completely match the regex
+ (anchored on both ends) will be included.
+ -->
+ <lib dir="../../../dist/" regex="apache-solr-cell-\d.*\.jar" />
+ <lib dir="../../../contrib/extraction/lib" regex=".*\.jar" />
+
+ <lib dir="../../../dist/" regex="apache-solr-clustering-\d.*\.jar" />
+ <lib dir="../../../contrib/clustering/lib/" regex=".*\.jar" />
+
+ <lib dir="../../../dist/" regex="apache-solr-langid-\d.*\.jar" />
+ <lib dir="../../../contrib/langid/lib/" regex=".*\.jar" />
+
+ <lib dir="../../../dist/" regex="apache-solr-velocity-\d.*\.jar" />
+ <lib dir="../../../contrib/velocity/lib" regex=".*\.jar" />
+
+ <!-- If a 'dir' option (with or without a regex) is used and nothing
+ is found that matches, it will be ignored
+ -->
+ <lib dir="/total/crap/dir/ignored" />
+
+ <!-- an exact 'path' can be used instead of a 'dir' to specify a
+ specific file. This will cause a serious error to be logged if
+ it can't be loaded.
+ -->
+ <!--
+ <lib path="../a-jar-that-does-not-exist.jar" />
+ -->
+
+ <!-- Data Directory
+
+ Used to specify an alternate directory to hold all index data
+ other than the default ./data under the Solr home. If
+ replication is in use, this should match the replication
+ configuration.
+ -->
+ <dataDir>${solr.data.dir:}</dataDir>
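+  <!-- For example, the property above can be supplied at startup time
+       (the path below is only an illustration):
+
+         java -Dsolr.data.dir=/var/data/solr -jar start.jar
+  -->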
+
+
+ <!-- The DirectoryFactory to use for indexes.
+
+ solr.StandardDirectoryFactory is filesystem
+ based and tries to pick the best implementation for the current
+ JVM and platform. solr.NRTCachingDirectoryFactory, the default,
+ wraps solr.StandardDirectoryFactory and caches small files in memory
+ for better NRT performance.
+
+ One can force a particular implementation via solr.MMapDirectoryFactory,
+ solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.
+
+ solr.RAMDirectoryFactory is memory based, not
+ persistent, and doesn't work with replication.
+ -->
+ <directoryFactory name="DirectoryFactory"
+ class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/>
+
+ <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Index Config - These settings control low-level behavior of indexing
+ Most example settings here show the default value, but are commented
+ out, to more easily see where customizations have been made.
+
+ Note: This replaces <indexDefaults> and <mainIndex> from older versions
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
+ <indexConfig>
+ <!-- maxFieldLength was removed in 4.0. To get similar behavior, include a
+ LimitTokenCountFilterFactory in your fieldType definition. E.g.
+ <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10000"/>
+ -->
+ <!-- Maximum time to wait for a write lock (ms) for an IndexWriter. Default: 1000 -->
+ <!-- <writeLockTimeout>1000</writeLockTimeout> -->
+
+ <!-- Expert: Enabling compound file will use less files for the index,
+       using fewer file descriptors, at the expense of a performance decrease.
+ Default in Lucene is "true". Default in Solr is "false" (since 3.6) -->
+ <!-- <useCompoundFile>false</useCompoundFile> -->
+
+ <!-- ramBufferSizeMB sets the amount of RAM that may be used by Lucene
+ indexing for buffering added documents and deletions before they are
+ flushed to the Directory.
+ maxBufferedDocs sets a limit on the number of documents buffered
+ before flushing.
+         If both ramBufferSizeMB and maxBufferedDocs are set, then
+ Lucene will flush based on whichever limit is hit first. -->
+ <!-- <ramBufferSizeMB>32</ramBufferSizeMB> -->
+ <!-- <maxBufferedDocs>1000</maxBufferedDocs> -->
+
+ <!-- Expert: Merge Policy
+ The Merge Policy in Lucene controls how merging of segments is done.
+ The default since Solr/Lucene 3.3 is TieredMergePolicy.
+       The default since Lucene 2.3 was the LogByteSizeMergePolicy.
+ Even older versions of Lucene used LogDocMergePolicy.
+ -->
+ <!--
+ <mergePolicy class="org.apache.lucene.index.TieredMergePolicy">
+ <int name="maxMergeAtOnce">10</int>
+ <int name="segmentsPerTier">10</int>
+ </mergePolicy>
+ -->
+
+ <!-- Merge Factor
+ The merge factor controls how many segments will get merged at a time.
+ For TieredMergePolicy, mergeFactor is a convenience parameter which
+ will set both MaxMergeAtOnce and SegmentsPerTier at once.
+ For LogByteSizeMergePolicy, mergeFactor decides how many new segments
+ will be allowed before they are merged into one.
+ Default is 10 for both merge policies.
+ -->
+ <!--
+ <mergeFactor>10</mergeFactor>
+ -->
+
+ <!-- Expert: Merge Scheduler
+ The Merge Scheduler in Lucene controls how merges are
+ performed. The ConcurrentMergeScheduler (Lucene 2.3 default)
+ can perform merges in the background using separate threads.
+ The SerialMergeScheduler (Lucene 2.2 default) does not.
+ -->
+ <!--
+ <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/>
+ -->
+
+ <!-- LockFactory
+
+ This option specifies which Lucene LockFactory implementation
+ to use.
+
+ single = SingleInstanceLockFactory - suggested for a
+ read-only index or when there is no possibility of
+ another process trying to modify the index.
+ native = NativeFSLockFactory - uses OS native file locking.
+ Do not use when multiple solr webapps in the same
+ JVM are attempting to share a single index.
+ simple = SimpleFSLockFactory - uses a plain file for locking
+
+         Defaults: 'native' is the default for Solr 3.6 and later;
+                  otherwise 'simple' is the default
+
+ More details on the nuances of each LockFactory...
+ http://wiki.apache.org/lucene-java/AvailableLockFactories
+ -->
+ <!-- <lockType>native</lockType> -->
+
+ <!-- Unlock On Startup
+
+ If true, unlock any held write or commit locks on startup.
+ This defeats the locking mechanism that allows multiple
+ processes to safely access a lucene index, and should be used
+ with care. Default is "false".
+
+ This is not needed if lock type is 'none' or 'single'
+ -->
+ <!--
+ <unlockOnStartup>false</unlockOnStartup>
+ -->
+
+ <!-- Expert: Controls how often Lucene loads terms into memory
+ Default is 128 and is likely good for most everyone.
+ -->
+ <!-- <termIndexInterval>128</termIndexInterval> -->
+
+ <!-- If true, IndexReaders will be reopened (often more efficient)
+ instead of closed and then opened. Default: true
+ -->
+ <!--
+ <reopenReaders>true</reopenReaders>
+ -->
+
+ <!-- Commit Deletion Policy
+
+ Custom deletion policies can be specified here. The class must
+ implement org.apache.lucene.index.IndexDeletionPolicy.
+
+ http://lucene.apache.org/java/3_5_0/api/core/org/apache/lucene/index/IndexDeletionPolicy.html
+
+ The default Solr IndexDeletionPolicy implementation supports
+       deleting index commit points based on the number of commits,
+       the age of the commit point, and optimized status.
+
+ The latest commit point should always be preserved regardless
+ of the criteria.
+ -->
+ <!--
+ <deletionPolicy class="solr.SolrDeletionPolicy">
+ -->
+ <!-- The number of commit points to be kept -->
+ <!-- <str name="maxCommitsToKeep">1</str> -->
+ <!-- The number of optimized commit points to be kept -->
+ <!-- <str name="maxOptimizedCommitsToKeep">0</str> -->
+ <!--
+ Delete all commit points once they have reached the given age.
+ Supports DateMathParser syntax e.g.
+ -->
+ <!--
+ <str name="maxCommitAge">30MINUTES</str>
+ <str name="maxCommitAge">1DAY</str>
+ -->
+ <!--
+ </deletionPolicy>
+ -->
+
+ <!-- Lucene Infostream
+
+ To aid in advanced debugging, Lucene provides an "InfoStream"
+ of detailed information when indexing.
+
+       Setting the value to true will instruct the underlying Lucene
+       IndexWriter to write its debugging info to the specified file
+ -->
+ <!-- <infoStream file="INFOSTREAM.txt">false</infoStream> -->
+ </indexConfig>
+
+
+ <!-- JMX
+
+ This example enables JMX if and only if an existing MBeanServer
+       is found; use this if you want to configure JMX through JVM
+ parameters. Remove this to disable exposing Solr configuration
+ and statistics to JMX.
+
+ For more details see http://wiki.apache.org/solr/SolrJmx
+ -->
+ <jmx />
+ <!-- If you want to connect to a particular server, specify the
+ agentId
+ -->
+ <!-- <jmx agentId="myAgent" /> -->
+ <!-- If you want to start a new MBeanServer, specify the serviceUrl -->
+ <!-- <jmx serviceUrl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr"/>
+ -->
+
+ <!-- The default high-performance update handler -->
+ <updateHandler class="solr.DirectUpdateHandler2">
+
+ <!-- AutoCommit
+
+ Perform a hard commit automatically under certain conditions.
+ Instead of enabling autoCommit, consider using "commitWithin"
+ when adding documents.
+
+ http://wiki.apache.org/solr/UpdateXmlMessages
+
+ maxDocs - Maximum number of documents to add since the last
+ commit before automatically triggering a new commit.
+
+ maxTime - Maximum amount of time in ms that is allowed to pass
+                 since a document was added before automatically
+ triggering a new commit.
+ openSearcher - if false, the commit causes recent index changes
+ to be flushed to stable storage, but does not cause a new
+ searcher to be opened to make those changes visible.
+ -->
+ <autoCommit>
+ <maxTime>15000</maxTime>
+ <openSearcher>false</openSearcher>
+ </autoCommit>
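+    <!-- With the hard autoCommit above, pending updates are committed at
+         most 15 seconds after they arrive. A client may instead request a
+         per-update deadline with commitWithin, e.g. (host, port and
+         document contents below are illustrative):
+
+           curl 'http://localhost:8983/solr/update?commitWithin=10000' \
+                -H 'Content-type:text/xml' -d '<add><doc>...</doc></add>'
+    -->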
+
+ <!-- softAutoCommit is like autoCommit except it causes a
+ 'soft' commit which only ensures that changes are visible
+ but does not ensure that data is synced to disk. This is
+ faster and more near-realtime friendly than a hard commit.
+ -->
+ <!--
+ <autoSoftCommit>
+ <maxTime>1000</maxTime>
+ </autoSoftCommit>
+ -->
+
+ <!-- Update Related Event Listeners
+
+ Various IndexWriter related events can trigger Listeners to
+ take actions.
+
+ postCommit - fired after every commit or optimize command
+ postOptimize - fired after every optimize command
+ -->
+ <!-- The RunExecutableListener executes an external command from a
+ hook such as postCommit or postOptimize.
+
+ exe - the name of the executable to run
+ dir - dir to use as the current working directory. (default=".")
+ wait - the calling thread waits until the executable returns.
+ (default="true")
+ args - the arguments to pass to the program. (default is none)
+ env - environment variables to set. (default is none)
+ -->
+ <!-- This example shows how RunExecutableListener could be used
+ with the script based replication...
+ http://wiki.apache.org/solr/CollectionDistribution
+ -->
+ <!--
+ <listener event="postCommit" class="solr.RunExecutableListener">
+ <str name="exe">solr/bin/snapshooter</str>
+ <str name="dir">.</str>
+ <bool name="wait">true</bool>
+ <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
+ <arr name="env"> <str>MYVAR=val1</str> </arr>
+ </listener>
+ -->
+
+ <!-- Enables a transaction log, currently used for real-time get.
+ "dir" - the target directory for transaction logs, defaults to the
+ solr data directory. -->
+ <updateLog>
+ <str name="dir">${solr.data.dir:}</str>
+ </updateLog>
+
+
+ </updateHandler>
+
+ <!-- IndexReaderFactory
+
+ Use the following format to specify a custom IndexReaderFactory,
+ which allows for alternate IndexReader implementations.
+
+ ** Experimental Feature **
+
+ Please note - Using a custom IndexReaderFactory may prevent
+ certain other features from working. The API to
+ IndexReaderFactory may change without warning or may even be
+ removed from future releases if the problems cannot be
+ resolved.
+
+
+ ** Features that may not work with custom IndexReaderFactory **
+
+ The ReplicationHandler assumes a disk-resident index. Using a
+ custom IndexReader implementation may cause incompatibility
+ with ReplicationHandler and may cause replication to not work
+ correctly. See SOLR-1366 for details.
+
+ -->
+ <!--
+ <indexReaderFactory name="IndexReaderFactory" class="package.class">
+ <str name="someArg">Some Value</str>
+ </indexReaderFactory >
+ -->
+ <!-- By explicitly declaring the Factory, the termIndexDivisor can
+ be specified.
+ -->
+ <!--
+ <indexReaderFactory name="IndexReaderFactory"
+ class="solr.StandardIndexReaderFactory">
+ <int name="setTermIndexDivisor">12</int>
+ </indexReaderFactory >
+ -->
+
+ <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Query section - these settings control query time things like caches
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
+ <query>
+ <!-- Max Boolean Clauses
+
+       Maximum number of clauses in each BooleanQuery; an exception
+ is thrown if exceeded.
+
+ ** WARNING **
+
+ This option actually modifies a global Lucene property that
+ will affect all SolrCores. If multiple solrconfig.xml files
+ disagree on this property, the value at any given moment will
+ be based on the last SolrCore to be initialized.
+
+ -->
+ <maxBooleanClauses>1024</maxBooleanClauses>
+
+
+ <!-- Solr Internal Query Caches
+
+ There are two implementations of cache available for Solr,
+ LRUCache, based on a synchronized LinkedHashMap, and
+ FastLRUCache, based on a ConcurrentHashMap.
+
+ FastLRUCache has faster gets and slower puts in single
+ threaded operation and thus is generally faster than LRUCache
+ when the hit ratio of the cache is high (> 75%), and may be
+ faster under other scenarios on multi-cpu systems.
+ -->
+
+ <!-- Filter Cache
+
+ Cache used by SolrIndexSearcher for filters (DocSets),
+ unordered sets of *all* documents that match a query. When a
+ new searcher is opened, its caches may be prepopulated or
+ "autowarmed" using data from caches in the old searcher.
+ autowarmCount is the number of items to prepopulate. For
+ LRUCache, the autowarmed items will be the most recently
+ accessed items.
+
+ Parameters:
+       class - the SolrCache implementation
+               (LRUCache or FastLRUCache)
+ size - the maximum number of entries in the cache
+ initialSize - the initial capacity (number of entries) of
+ the cache. (see java.util.HashMap)
+ autowarmCount - the number of entries to prepopulate from
+                       an old cache.
+ -->
+ <filterCache class="solr.FastLRUCache"
+ size="512"
+ initialSize="512"
+ autowarmCount="0"/>
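+    <!-- Each distinct fq clause gets its own cached DocSet, so repeated
+         requests such as (field name is illustrative):
+           /select?q=solr&fq=inStock:true
+           /select?q=lucene&fq=inStock:true
+         share a single cached filter for inStock:true. -->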
+
+ <!-- Query Result Cache
+
+ Caches results of searches - ordered lists of document ids
+ (DocList) based on a query, a sort, and the range of documents requested.
+ -->
+ <queryResultCache class="solr.LRUCache"
+ size="512"
+ initialSize="512"
+ autowarmCount="0"/>
+
+ <!-- Document Cache
+
+ Caches Lucene Document objects (the stored fields for each
+ document). Since Lucene internal document ids are transient,
+ this cache will not be autowarmed.
+ -->
+ <documentCache class="solr.LRUCache"
+ size="512"
+ initialSize="512"
+ autowarmCount="0"/>
+
+ <!-- Field Value Cache
+
+ Cache used to hold field values that are quickly accessible
+ by document id. The fieldValueCache is created by default
+ even if not configured here.
+ -->
+ <!--
+ <fieldValueCache class="solr.FastLRUCache"
+ size="512"
+ autowarmCount="128"
+ showItems="32" />
+ -->
+
+ <!-- Custom Cache
+
+ Example of a generic cache. These caches may be accessed by
+      name through SolrIndexSearcher.getCache(), cacheLookup(), and
+ cacheInsert(). The purpose is to enable easy caching of
+ user/application level data. The regenerator argument should
+ be specified as an implementation of solr.CacheRegenerator
+ if autowarming is desired.
+ -->
+ <!--
+ <cache name="myUserCache"
+ class="solr.LRUCache"
+ size="4096"
+ initialSize="1024"
+ autowarmCount="1024"
+ regenerator="com.mycompany.MyRegenerator"
+ />
+ -->
+
+
+ <!-- Lazy Field Loading
+
+ If true, stored fields that are not requested will be loaded
+ lazily. This can result in a significant speed improvement
+ if the usual case is to not load all stored fields,
+ especially if the skipped fields are large compressed text
+ fields.
+ -->
+ <enableLazyFieldLoading>true</enableLazyFieldLoading>
+
+ <!-- Use Filter For Sorted Query
+
+ A possible optimization that attempts to use a filter to
+ satisfy a search. If the requested sort does not include
+ score, then the filterCache will be checked for a filter
+ matching the query. If found, the filter will be used as the
+ source of document ids, and then the sort will be applied to
+ that.
+
+ For most situations, this will not be useful unless you
+ frequently get the same search repeatedly with different sort
+ options, and none of them ever use "score"
+ -->
+ <!--
+ <useFilterForSortedQuery>true</useFilterForSortedQuery>
+ -->
+
+ <!-- Result Window Size
+
+ An optimization for use with the queryResultCache. When a search
+ is requested, a superset of the requested number of document ids
+ are collected. For example, if a search for a particular query
+       requests matching documents 10 through 19, and queryResultWindowSize is 50,
+ then documents 0 through 49 will be collected and cached. Any further
+ requests in that range can be satisfied via the cache.
+ -->
+ <queryResultWindowSize>20</queryResultWindowSize>
+
+ <!-- Maximum number of documents to cache for any entry in the
+ queryResultCache.
+ -->
+ <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
+
+ <!-- Query Related Event Listeners
+
+ Various IndexSearcher related events can trigger Listeners to
+ take actions.
+
+ newSearcher - fired whenever a new searcher is being prepared
+ and there is a current searcher handling requests (aka
+ registered). It can be used to prime certain caches to
+ prevent long request times for certain requests.
+
+ firstSearcher - fired whenever a new searcher is being
+ prepared but there is no current registered searcher to handle
+ requests or to gain autowarming data from.
+
+
+ -->
+ <!-- QuerySenderListener takes an array of NamedList and executes a
+ local query request for each NamedList in sequence.
+ -->
+ <listener event="newSearcher" class="solr.QuerySenderListener">
+ <arr name="queries">
+ <!--
+ <lst><str name="q">solr</str><str name="sort">price asc</str></lst>
+ <lst><str name="q">rocks</str><str name="sort">weight asc</str></lst>
+ -->
+ </arr>
+ </listener>
+ <listener event="firstSearcher" class="solr.QuerySenderListener">
+ <arr name="queries">
+ <lst>
+ <str name="q">static firstSearcher warming in solrconfig.xml</str>
+ </lst>
+ </arr>
+ </listener>
+
+ <!-- Use Cold Searcher
+
+ If a search request comes in and there is no current
+ registered searcher, then immediately register the still
+ warming searcher and use it. If "false" then all requests
+ will block until the first searcher is done warming.
+ -->
+ <useColdSearcher>false</useColdSearcher>
+
+ <!-- Max Warming Searchers
+
+ Maximum number of searchers that may be warming in the
+ background concurrently. An error is returned if this limit
+ is exceeded.
+
+       Recommended values of 1-2 for read-only slaves, higher for
+ masters w/o cache warming.
+ -->
+ <maxWarmingSearchers>2</maxWarmingSearchers>
+
+ </query>
+
+
+ <!-- Request Dispatcher
+
+ This section contains instructions for how the SolrDispatchFilter
+ should behave when processing requests for this SolrCore.
+
+ handleSelect is a legacy option that affects the behavior of requests
+ such as /select?qt=XXX
+
+ handleSelect="true" will cause the SolrDispatchFilter to process
+ the request and dispatch the query to a handler specified by the
+ "qt" param, assuming "/select" isn't already registered.
+
+ handleSelect="false" will cause the SolrDispatchFilter to
+ ignore "/select" requests, resulting in a 404 unless a handler
+ is explicitly registered with the name "/select"
+
+ handleSelect="true" is not recommended for new users, but is the default
+ for backwards compatibility
+ -->
+ <requestDispatcher handleSelect="false" >
+ <!-- Request Parsing
+
+ These settings indicate how Solr Requests may be parsed, and
+ what restrictions may be placed on the ContentStreams from
+ those requests
+
+ enableRemoteStreaming - enables use of the stream.file
+ and stream.url parameters for specifying remote streams.
+
+ multipartUploadLimitInKB - specifies the max size of
+ Multipart File Uploads that Solr will allow in a Request.
+
+ *** WARNING ***
+       The settings below authorize Solr to fetch remote files; you
+ should make sure your system has some authentication before
+ using enableRemoteStreaming="true"
+
+ -->
+ <requestParsers enableRemoteStreaming="true"
+ multipartUploadLimitInKB="2048000" />
+
+ <!-- HTTP Caching
+
+ Set HTTP caching related parameters (for proxy caches and clients).
+
+ The options below instruct Solr not to output any HTTP Caching
+ related headers
+ -->
+ <httpCaching never304="true" />
+ <!-- If you include a <cacheControl> directive, it will be used to
+ generate a Cache-Control header (as well as an Expires header
+ if the value contains "max-age=")
+
+ By default, no Cache-Control header is generated.
+
+ You can use the <cacheControl> option even if you have set
+ never304="true"
+ -->
+ <!--
+ <httpCaching never304="true" >
+ <cacheControl>max-age=30, public</cacheControl>
+ </httpCaching>
+ -->
+ <!-- To enable Solr to respond with automatically generated HTTP
+       Caching headers, and to respond to Cache Validation requests
+ correctly, set the value of never304="false"
+
+ This will cause Solr to generate Last-Modified and ETag
+ headers based on the properties of the Index.
+
+ The following options can also be specified to affect the
+ values of these headers...
+
+ lastModFrom - the default value is "openTime" which means the
+ Last-Modified value (and validation against If-Modified-Since
+ requests) will all be relative to when the current Searcher
+ was opened. You can change it to lastModFrom="dirLastMod" if
+ you want the value to exactly correspond to when the physical
+ index was last modified.
+
+ etagSeed="..." is an option you can change to force the ETag
+ header (and validation against If-None-Match requests) to be
+ different even if the index has not changed (ie: when making
+ significant changes to your config file)
+
+       (lastModFrom and etagSeed are both ignored if you use
+ the never304="true" option)
+ -->
+ <!--
+    <httpCaching lastModFrom="openTime"
+ etagSeed="Solr">
+ <cacheControl>max-age=30, public</cacheControl>
+ </httpCaching>
+ -->
+ </requestDispatcher>
+
+ <!-- Request Handlers
+
+ http://wiki.apache.org/solr/SolrRequestHandler
+
+ Incoming queries will be dispatched to a specific handler by name
+ based on the path specified in the request.
+
+ Legacy behavior: If the request path uses "/select" but no Request
+ Handler has that name, and if handleSelect="true" has been specified in
+ the requestDispatcher, then the Request Handler is dispatched based on
+       the qt parameter. Handlers without a leading '/' are accessed
+       like so: http://host/app/[core/]select?qt=name. If no qt is
+ given, then the requestHandler that declares default="true" will be
+ used or the one named "standard".
+
+ If a Request Handler is declared with startup="lazy", then it will
+ not be initialized until the first request that uses it.
+
+ -->
+ <!-- SearchHandler
+
+ http://wiki.apache.org/solr/SearchHandler
+
+ For processing Search Queries, the primary Request Handler
+ provided with Solr is "SearchHandler" It delegates to a sequent
+ of SearchComponents (see below) and supports distributed
+ queries across multiple shards
+ -->
+ <requestHandler name="/select" class="solr.SearchHandler">
+ <!-- default values for query parameters can be specified, these
+ will be overridden by parameters in the request
+ -->
+ <lst name="defaults">
+ <str name="echoParams">explicit</str>
+ <int name="rows">50</int>
+ <str name="df">text</str>
+ <bool name="tv">true</bool>
+ </lst>
+ <!-- In addition to defaults, "appends" params can be specified
+ to identify values which should be appended to the list of
+ multi-val params from the query (or the existing "defaults").
+ -->
+ <!-- In this example, the param "fq=instock:true" would be appended to
+ any query time fq params the user may specify, as a mechanism for
+ partitioning the index, independent of any user selected filtering
+ that may also be desired (perhaps as a result of faceted searching).
+
+ NOTE: there is *absolutely* nothing a client can do to prevent these
+ "appends" values from being used, so don't use this mechanism
+ unless you are sure you always want it.
+ -->
+ <!--
+ <lst name="appends">
+ <str name="fq">inStock:true</str>
+ </lst>
+ -->
+ <!-- "invariants" are a way of letting the Solr maintainer lock down
+ the options available to Solr clients. Any params values
+ specified here are used regardless of what values may be specified
+ in either the query, the "defaults", or the "appends" params.
+
+ In this example, the facet.field and facet.query params would
+ be fixed, limiting the facets clients can use. Faceting is
+ not turned on by default - but if the client does specify
+ facet=true in the request, these are the only facets they
+ will be able to see counts for; regardless of what other
+ facet.field or facet.query params they may specify.
+
+ NOTE: there is *absolutely* nothing a client can do to prevent these
+ "invariants" values from being used, so don't use this mechanism
+ unless you are sure you always want it.
+ -->
+ <!--
+ <lst name="invariants">
+ <str name="facet.field">cat</str>
+ <str name="facet.field">manu_exact</str>
+ <str name="facet.query">price:[* TO 500]</str>
+ <str name="facet.query">price:[500 TO *]</str>
+ </lst>
+ -->
+ <!-- If the default list of SearchComponents is not desired, that
+ list can either be overridden completely, or components can be
+ prepended or appended to the default list. (see below)
+ -->
+ <!--
+ <arr name="components">
+ <str>nameOfCustomComponent1</str>
+ <str>nameOfCustomComponent2</str>
+ </arr>
+ -->
+ <arr name="last-components">
+ <str>tvComponent</str>
+ </arr>
+ </requestHandler>
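+  <!-- With the defaults above, a minimal request such as (example host/port)
+         http://localhost:8983/solr/select?q=apache
+       searches the "text" field (df), returns up to 50 rows, and includes
+       term vector data via the tvComponent appended above. -->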
+
+ <!-- A request handler that returns indented JSON by default -->
+ <requestHandler name="/query" class="solr.SearchHandler">
+ <lst name="defaults">
+ <str name="echoParams">explicit</str>
+ <str name="wt">json</str>
+ <str name="indent">true</str>
+ <str name="df">text</str>
+ </lst>
+ </requestHandler>
+
+
+ <!-- realtime get handler, guaranteed to return the latest stored fields of
+ any document, without the need to commit or open a new searcher. The
+ current implementation relies on the updateLog feature being enabled. -->
+ <requestHandler name="/get" class="solr.RealTimeGetHandler">
+ <lst name="defaults">
+ <str name="omitHeader">true</str>
+ <str name="wt">json</str>
+ <str name="indent">true</str>
+ </lst>
+ </requestHandler>
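+  <!-- e.g. /get?id=mydoc (or ?ids=doc1,doc2 for several documents) returns
+       the latest stored fields as JSON; the ids shown are placeholders for
+       your uniqueKey values. -->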
+
+
+ <!-- A Robust Example
+
+ This example SearchHandler declaration shows off usage of the
+ SearchHandler with many defaults declared
+
+       Note that the same Request Handler class (SearchHandler) can
+       be registered multiple times with different
+ names (and different init parameters)
+ -->
+ <requestHandler name="/browse" class="solr.SearchHandler">
+ <lst name="defaults">
+ <str name="echoParams">explicit</str>
+
+ <!-- VelocityResponseWriter settings -->
+ <str name="wt">velocity</str>
+ <str name="v.template">browse</str>
+ <str name="v.layout">layout</str>
+ <str name="title">Solritas</str>
+
+ <!-- Query settings -->
+ <str name="defType">edismax</str>
+ <str name="qf">
+ text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
+ title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0
+ </str>
+ <str name="df">text</str>
+ <str name="mm">100%</str>
+ <str name="q.alt">*:*</str>
+ <str name="rows">10</str>
+ <str name="fl">*,score</str>
+
+ <str name="mlt.qf">
+ text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
+ title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0
+ </str>
+ <str name="mlt.fl">text,features,name,sku,id,manu,cat,title,description,keywords,author,resourcename</str>
+ <int name="mlt.count">3</int>
+
+ <!-- Faceting defaults -->
+ <str name="facet">on</str>
+ <str name="facet.field">cat</str>
+ <str name="facet.field">manu_exact</str>
+ <str name="facet.field">content_type</str>
+ <str name="facet.field">author_s</str>
+ <str name="facet.query">ipod</str>
+ <str name="facet.query">GB</str>
+ <str name="facet.mincount">1</str>
+ <str name="facet.pivot">cat,inStock</str>
+ <str name="facet.range.other">after</str>
+ <str name="facet.range">price</str>
+ <int name="f.price.facet.range.start">0</int>
+ <int name="f.price.facet.range.end">600</int>
+ <int name="f.price.facet.range.gap">50</int>
+ <str name="facet.range">popularity</str>
+ <int name="f.popularity.facet.range.start">0</int>
+ <int name="f.popularity.facet.range.end">10</int>
+ <int name="f.popularity.facet.range.gap">3</int>
+ <str name="facet.range">manufacturedate_dt</str>
+ <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
+ <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
+ <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
+ <str name="f.manufacturedate_dt.facet.range.other">before</str>
+ <str name="f.manufacturedate_dt.facet.range.other">after</str>
+
+ <!-- Highlighting defaults -->
+ <str name="hl">on</str>
+ <str name="hl.fl">content features title name</str>
+ <str name="hl.encoder">html</str>
+ <str name="hl.simple.pre"><b></str>
+ <str name="hl.simple.post"></b></str>
+ <str name="f.title.hl.fragsize">0</str>
+ <str name="f.title.hl.alternateField">title</str>
+ <str name="f.name.hl.fragsize">0</str>
+ <str name="f.name.hl.alternateField">name</str>
+ <str name="f.content.hl.snippets">3</str>
+ <str name="f.content.hl.fragsize">200</str>
+ <str name="f.content.hl.alternateField">content</str>
+ <str name="f.content.hl.maxAlternateFieldLength">750</str>
+
+ <!-- Spell checking defaults -->
+ <str name="spellcheck">on</str>
+ <str name="spellcheck.extendedResults">false</str>
+ <str name="spellcheck.count">5</str>
+ <str name="spellcheck.alternativeTermCount">2</str>
+ <str name="spellcheck.maxResultsForSuggest">5</str>
+ <str name="spellcheck.collate">true</str>
+ <str name="spellcheck.collateExtendedResults">true</str>
+ <str name="spellcheck.maxCollationTries">5</str>
+ <str name="spellcheck.maxCollations">3</str>
+ </lst>
+
+ <!-- append spellchecking to our list of components -->
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
+
+
+ <!-- Update Request Handler.
+
+ http://wiki.apache.org/solr/UpdateXmlMessages
+
+ The canonical Request Handler for Modifying the Index through
+ commands specified using XML, JSON, CSV, or JAVABIN
+
+       Note: Since Solr 1.1, request handlers require a valid content
+ type header if posted in the body. For example, curl now
+ requires: -H 'Content-type:text/xml; charset=utf-8'
+
+ To override the request content type and force a specific
+ Content-type, use the request parameter:
+ ?update.contentType=text/csv
+
+ This handler will pick a response format to match the input
+ if the 'wt' parameter is not explicit
+ -->
+ <requestHandler name="/update" class="solr.UpdateRequestHandler">
+ <!-- See below for information on defining
+ updateRequestProcessorChains that can be used by name
+ on each Update Request
+ -->
+ <!--
+ <lst name="defaults">
+ <str name="update.chain">dedupe</str>
+ </lst>
+ -->
+ </requestHandler>
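+  <!-- A sample update request, assuming the example host/port and fields
+       that exist in your schema:
+
+         curl 'http://localhost:8983/solr/update?commit=true' \
+              -H 'Content-type:application/json' \
+              -d '[{"id":"doc1","title":"hello"}]'
+  -->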
+
+
+ <!-- Solr Cell Update Request Handler
+
+ http://wiki.apache.org/solr/ExtractingRequestHandler
+
+ -->
+ <requestHandler name="/update/extract"
+ startup="lazy"
+ class="solr.extraction.ExtractingRequestHandler" >
+ <lst name="defaults">
+ <str name="lowernames">true</str>
+ <str name="uprefix">ignored_</str>
+
+ <!-- capture link hrefs but ignore div attributes -->
+ <str name="captureAttr">true</str>
+ <str name="fmap.a">links</str>
+ <str name="fmap.div">ignored_</str>
+ </lst>
+ </requestHandler>
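+  <!-- A sample extraction request (id and file name are illustrative):
+
+         curl 'http://localhost:8983/solr/update/extract?literal.id=doc1&commit=true' \
+              -F 'myfile=@tutorial.pdf'
+  -->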
+
+
+ <!-- Field Analysis Request Handler
+
+ RequestHandler that provides much the same functionality as
+ analysis.jsp. Provides the ability to specify multiple field
+ types and field names in the same request and outputs
+ index-time and query-time analysis for each of them.
+
+ Request parameters are:
+ analysis.fieldname - field name whose analyzers are to be used
+
+ analysis.fieldtype - field type whose analyzers are to be used
+ analysis.fieldvalue - text for index-time analysis
+ q (or analysis.q) - text for query time analysis
+ analysis.showmatch (true|false) - When set to true and when
+ query analysis is performed, the produced tokens of the
+ field value analysis will be marked as "matched" for every
+         token that is produced by the query analysis
+ -->
+ <requestHandler name="/analysis/field"
+ startup="lazy"
+ class="solr.FieldAnalysisRequestHandler" />
+
+
+ <!-- Document Analysis Handler
+
+ http://wiki.apache.org/solr/AnalysisRequestHandler
+
+ An analysis handler that provides a breakdown of the analysis
+ process of provided documents. This handler expects a (single)
+ content stream with the following format:
+
+ <docs>
+ <doc>
+ <field name="id">1</field>
+ <field name="name">The Name</field>
+ <field name="text">The Text Value</field>
+ </doc>
+ <doc>...</doc>
+ <doc>...</doc>
+ ...
+ </docs>
+
+ Note: Each document must contain a field which serves as the
+ unique key. This key is used in the returned response to associate
+ an analysis breakdown to the analyzed document.
+
+ Like the FieldAnalysisRequestHandler, this handler also supports
+ query analysis by sending either an "analysis.query" or "q"
+ request parameter that holds the query text to be analyzed. It
+ also supports the "analysis.showmatch" parameter which when set to
+ true, all field tokens that match the query tokens will be marked
+ as a "match".
+ -->
+ <requestHandler name="/analysis/document"
+ class="solr.DocumentAnalysisRequestHandler"
+ startup="lazy" />
+
+ <!-- Admin Handlers
+
+       This will register all the standard admin
+ RequestHandlers.
+ -->
+ <requestHandler name="/admin/"
+ class="solr.admin.AdminHandlers" />
+ <!-- This single handler is equivalent to the following... -->
+ <!--
+ <requestHandler name="/admin/luke" class="solr.admin.LukeRequestHandler" />
+ <requestHandler name="/admin/system" class="solr.admin.SystemInfoHandler" />
+ <requestHandler name="/admin/plugins" class="solr.admin.PluginInfoHandler" />
+ <requestHandler name="/admin/threads" class="solr.admin.ThreadDumpHandler" />
+ <requestHandler name="/admin/properties" class="solr.admin.PropertiesRequestHandler" />
+ <requestHandler name="/admin/file" class="solr.admin.ShowFileRequestHandler" >
+ -->
+ <!-- If you wish to hide files under ${solr.home}/conf, explicitly
+ register the ShowFileRequestHandler using:
+ -->
+ <!--
+ <requestHandler name="/admin/file"
+ class="solr.admin.ShowFileRequestHandler" >
+ <lst name="invariants">
+ <str name="hidden">synonyms.txt</str>
+ <str name="hidden">anotherfile.txt</str>
+ </lst>
+ </requestHandler>
+ -->
+
+ <!-- ping/healthcheck -->
+ <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
+ <lst name="invariants">
+ <str name="q">solrpingquery</str>
+ </lst>
+ <lst name="defaults">
+ <str name="echoParams">all</str>
+ </lst>
+ <!-- An optional feature of the PingRequestHandler is to configure the
+ handler with a "healthcheckFile" which can be used to enable/disable
+ the PingRequestHandler.
+       Relative paths are resolved against the data dir.
+ -->
+ <!-- <str name="healthcheckFile">server-enabled.txt</str> -->
+ </requestHandler>
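+  <!-- e.g. /admin/ping?wt=json reports status "OK" while the core can
+       execute the invariant query above. -->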
+
+ <!-- Echo the request contents back to the client -->
+ <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
+ <lst name="defaults">
+ <str name="echoParams">explicit</str>
+ <str name="echoHandler">true</str>
+ </lst>
+ </requestHandler>
+
+ <!-- Solr Replication
+
+ The SolrReplicationHandler supports replicating indexes from a
+ "master" used for indexing and "slaves" used for queries.
+
+ http://wiki.apache.org/solr/SolrReplication
+
+ In the example below, remove the <lst name="master"> section if
+ this is just a slave and remove the <lst name="slave"> section
+ if this is just a master.
+ -->
+ <!--
+ <requestHandler name="/replication" class="solr.ReplicationHandler" >
+ <lst name="master">
+ <str name="replicateAfter">commit</str>
+ <str name="replicateAfter">startup</str>
+ <str name="confFiles">schema.xml,stopwords.txt</str>
+ </lst>
+ <lst name="slave">
+ <str name="masterUrl">http://localhost:8983/solr</str>
+ <str name="pollInterval">00:00:60</str>
+ </lst>
+ </requestHandler>
+ -->
+
+ <!-- Solr Replication for SolrCloud Recovery
+
+       This is the config needed for SolrCloud's recovery replication.
+ -->
+ <requestHandler name="/replication" class="solr.ReplicationHandler" startup="lazy" />
+
+
+ <!-- Search Components
+
+ Search components are registered to SolrCore and used by
+ instances of SearchHandler (which can access them by name)
+
+ By default, the following components are available:
+
+ <searchComponent name="query" class="solr.QueryComponent" />
+ <searchComponent name="facet" class="solr.FacetComponent" />
+ <searchComponent name="mlt" class="solr.MoreLikeThisComponent" />
+ <searchComponent name="highlight" class="solr.HighlightComponent" />
+ <searchComponent name="stats" class="solr.StatsComponent" />
+ <searchComponent name="debug" class="solr.DebugComponent" />
+
+ Default configuration in a requestHandler would look like:
+
+ <arr name="components">
+ <str>query</str>
+ <str>facet</str>
+ <str>mlt</str>
+ <str>highlight</str>
+ <str>stats</str>
+ <str>debug</str>
+ </arr>
+
+ If you register a searchComponent to one of the standard names,
+ that will be used instead of the default.
+
+ To insert components before or after the 'standard' components, use:
+
+ <arr name="first-components">
+ <str>myFirstComponentName</str>
+ </arr>
+
+ <arr name="last-components">
+ <str>myLastComponentName</str>
+ </arr>
+
+ NOTE: The component registered with the name "debug" will
+ always be executed after the "last-components"
+
+ -->
+
+ <!-- Spell Check
+
+ The spell check component can return a list of alternative spelling
+ suggestions.
+
+ http://wiki.apache.org/solr/SpellCheckComponent
+ -->
+ <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
+
+ <str name="queryAnalyzerFieldType">textSpell</str>
+
+ <!-- Multiple "Spell Checkers" can be declared and used by this
+ component
+ -->
+
+ <!-- a spellchecker built from a field of the main index -->
+ <lst name="spellchecker">
+ <str name="name">default</str>
+ <str name="field">name</str>
+ <str name="classname">solr.DirectSolrSpellChecker</str>
+ <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
+ <str name="distanceMeasure">internal</str>
+ <!-- minimum accuracy needed to be considered a valid spellcheck suggestion -->
+ <float name="accuracy">0.5</float>
+ <!-- the maximum #edits we consider when enumerating terms: can be 1 or 2 -->
+ <int name="maxEdits">2</int>
+ <!-- the minimum shared prefix when enumerating terms -->
+ <int name="minPrefix">1</int>
+ <!-- maximum number of inspections per result. -->
+ <int name="maxInspections">5</int>
+ <!-- minimum length of a query term to be considered for correction -->
+ <int name="minQueryLength">4</int>
+      <!-- maximum threshold of documents a query term can appear in to be considered for correction -->
+ <float name="maxQueryFrequency">0.01</float>
+ <!-- uncomment this to require suggestions to occur in 1% of the documents
+ <float name="thresholdTokenFrequency">.01</float>
+ -->
+ </lst>
+
+ <!-- a spellchecker that can break or combine words. See "/spell" handler below for usage -->
+ <lst name="spellchecker">
+ <str name="name">wordbreak</str>
+ <str name="classname">solr.WordBreakSolrSpellChecker</str>
+ <str name="field">name</str>
+ <str name="combineWords">true</str>
+ <str name="breakWords">true</str>
+ <int name="maxChanges">10</int>
+ </lst>
+
+ <!-- a spellchecker that uses a different distance measure -->
+ <!--
+ <lst name="spellchecker">
+ <str name="name">jarowinkler</str>
+ <str name="field">spell</str>
+ <str name="classname">solr.DirectSolrSpellChecker</str>
+ <str name="distanceMeasure">
+ org.apache.lucene.search.spell.JaroWinklerDistance
+ </str>
+ </lst>
+ -->
+
+  <!-- a spellchecker that uses an alternate comparator
+
+       comparatorClass can be one of:
+ 1. score (default)
+ 2. freq (Frequency first, then score)
+ 3. A fully qualified class name
+ -->
+ <!--
+ <lst name="spellchecker">
+ <str name="name">freq</str>
+ <str name="field">lowerfilt</str>
+ <str name="classname">solr.DirectSolrSpellChecker</str>
+ <str name="comparatorClass">freq</str>
+ -->
+
+ <!-- A spellchecker that reads the list of words from a file -->
+ <!--
+ <lst name="spellchecker">
+ <str name="classname">solr.FileBasedSpellChecker</str>
+ <str name="name">file</str>
+ <str name="sourceLocation">spellings.txt</str>
+ <str name="characterEncoding">UTF-8</str>
+ <str name="spellcheckIndexDir">spellcheckerFile</str>
+ </lst>
+ -->
+ </searchComponent>
+
+ <!-- A request handler for demonstrating the spellcheck component.
+
+       NOTE: This is purely an example. The whole purpose of the
+ SpellCheckComponent is to hook it into the request handler that
+ handles your normal user queries so that a separate request is
+ not needed to get suggestions.
+
+       IN OTHER WORDS, THERE IS A REALLY GOOD CHANCE THE SETUP BELOW IS
+ NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
+
+ See http://wiki.apache.org/solr/SpellCheckComponent for details
+ on the request parameters.
+ -->
+ <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
+ <lst name="defaults">
+ <str name="df">text</str>
+ <!-- Solr will use suggestions from both the 'default' spellchecker
+ and from the 'wordbreak' spellchecker and combine them.
+ collations (re-written queries) can include a combination of
+ corrections from both spellcheckers -->
+ <str name="spellcheck.dictionary">default</str>
+ <str name="spellcheck.dictionary">wordbreak</str>
+ <str name="spellcheck">on</str>
+ <str name="spellcheck.extendedResults">true</str>
+ <str name="spellcheck.count">10</str>
+ <str name="spellcheck.alternativeTermCount">5</str>
+ <str name="spellcheck.maxResultsForSuggest">5</str>
+ <str name="spellcheck.collate">true</str>
+ <str name="spellcheck.collateExtendedResults">true</str>
+ <str name="spellcheck.maxCollationTries">10</str>
+ <str name="spellcheck.maxCollations">5</str>
+ </lst>
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
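+  <!-- e.g. a misspelled query such as /spell?q=solrr&wt=json should return
+       suggestions from the 'default' dictionary plus word-break collations
+       from 'wordbreak' (query text is illustrative). -->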
+
+ <!-- Term Vector Component
+
+ http://wiki.apache.org/solr/TermVectorComponent
+ -->
+ <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
+
+ <!-- A request handler for demonstrating the term vector component
+
+       This is purely an example.
+
+ In reality you will likely want to add the component to your
+ already specified request handlers.
+ -->
+ <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy">
+ <lst name="defaults">
+ <str name="df">text</str>
+ <bool name="tv">true</bool>
+ </lst>
+ <arr name="last-components">
+ <str>tvComponent</str>
+ </arr>
+ </requestHandler>
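+  <!-- e.g. /tvrh?q=*:*&tv.tf=true&tv.df=true returns term and document
+       frequencies for fields indexed with termVectors="true". -->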
+
+ <!-- Clustering Component
+
+ http://wiki.apache.org/solr/ClusteringComponent
+
+       You'll need to set the solr.clustering.enabled system property
+ when running solr to run with clustering enabled:
+
+ java -Dsolr.clustering.enabled=true -jar start.jar
+
+ -->
+ <searchComponent name="clustering"
+ enable="${solr.clustering.enabled:false}"
+ class="solr.clustering.ClusteringComponent" >
+ <!-- Declare an engine -->
+ <lst name="engine">
+      <!-- The name; only one engine can be named "default" -->
+ <str name="name">default</str>
+
+ <!-- Class name of Carrot2 clustering algorithm.
+
+ Currently available algorithms are:
+
+ * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
+ * org.carrot2.clustering.stc.STCClusteringAlgorithm
+ * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
+
+ See http://project.carrot2.org/algorithms.html for the
+ algorithm's characteristics.
+ -->
+ <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
+ <!-- Overriding values for Carrot2 default algorithm attributes.
+
+ For a description of all available attributes, see:
+ http://download.carrot2.org/stable/manual/#chapter.components.
+ Use attribute key as name attribute of str elements
+ below. These can be further overridden for individual
+ requests by specifying attribute key as request parameter
+ name and attribute value as parameter value.
+ -->
+ <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+
+ <!-- Location of Carrot2 lexical resources.
+
+ A directory from which to load Carrot2-specific stop words
+ and stop labels. Absolute or relative to Solr config directory.
+ If a specific resource (e.g. stopwords.en) is present in the
+ specified dir, it will completely override the corresponding
+ default one that ships with Carrot2.
+
+ For an overview of Carrot2 lexical resources, see:
+ http://download.carrot2.org/head/manual/#chapter.lexical-resources
+ -->
+ <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
+ <!-- The language to assume for the documents.
+
+ For a list of allowed values, see:
+ http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+ -->
+ <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
+ </lst>
+ <lst name="engine">
+ <str name="name">stc</str>
+ <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
+ </lst>
+ </searchComponent>
+
+ <!-- A request handler for demonstrating the clustering component
+
+       This is purely an example.
+
+ In reality you will likely want to add the component to your
+ already specified request handlers.
+ -->
+ <requestHandler name="/clustering"
+ startup="lazy"
+ enable="${solr.clustering.enabled:false}"
+ class="solr.SearchHandler">
+ <lst name="defaults">
+ <bool name="clustering">true</bool>
+ <str name="clustering.engine">default</str>
+ <bool name="clustering.results">true</bool>
+ <!-- The title field -->
+ <str name="carrot.title">name</str>
+ <str name="carrot.url">id</str>
+ <!-- The field to cluster on -->
+ <str name="carrot.snippet">features</str>
+ <!-- produce summaries -->
+ <bool name="carrot.produceSummary">true</bool>
+ <!-- the maximum number of labels per cluster -->
+ <!--<int name="carrot.numDescriptions">5</int>-->
+ <!-- produce sub clusters -->
+ <bool name="carrot.outputSubClusters">false</bool>
+
+ <str name="defType">edismax</str>
+ <str name="qf">
+ text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
+ </str>
+ <str name="q.alt">*:*</str>
+ <str name="rows">10</str>
+ <str name="fl">*,score</str>
+ </lst>
+ <arr name="last-components">
+ <str>clustering</str>
+ </arr>
+ </requestHandler>
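+  <!-- Once clustering is enabled (see the java invocation above), a request
+       like /clustering?q=*:*&rows=100 returns the normal result list plus
+       clusters built from the "features" snippet field configured above. -->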
+
+ <!-- Terms Component
+
+ http://wiki.apache.org/solr/TermsComponent
+
+ A component to return terms and document frequency of those
+ terms
+ -->
+ <searchComponent name="terms" class="solr.TermsComponent"/>
+
+ <!-- A request handler for demonstrating the terms component -->
+ <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
+ <lst name="defaults">
+ <bool name="terms">true</bool>
+ </lst>
+ <arr name="components">
+ <str>terms</str>
+ </arr>
+ </requestHandler>
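+  <!-- e.g. /terms?terms.fl=name&terms.prefix=a lists indexed terms from the
+       "name" field that start with "a" (field name is illustrative). -->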
+
+
+ <!-- Query Elevation Component
+
+ http://wiki.apache.org/solr/QueryElevationComponent
+
+ a search component that enables you to configure the top
+ results for a given query regardless of the normal lucene
+ scoring.
+ -->
+<!-- <searchComponent name="elevator" class="solr.QueryElevationComponent" >-->
+ <!-- pick a fieldType to analyze queries -->
+ <!--<str name="queryFieldType">string</str>
+ <str name="config-file">elevate.xml</str>
+ </searchComponent>-->
+
+ <!-- A request handler for demonstrating the elevator component -->
+<!-- <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
+ <lst name="defaults">
+ <str name="echoParams">explicit</str>
+ <str name="df">text</str>
+ </lst>
+ <arr name="last-components">
+ <str>elevator</str>
+ </arr>
+ </requestHandler>-->
+
+ <!-- Highlighting Component
+
+ http://wiki.apache.org/solr/HighlightingParameters
+ -->
+ <searchComponent class="solr.HighlightComponent" name="highlight">
+ <highlighting>
+ <!-- Configure the standard fragmenter -->
+ <!-- This could most likely be commented out in the "default" case -->
+ <fragmenter name="gap"
+ default="true"
+ class="solr.highlight.GapFragmenter">
+ <lst name="defaults">
+ <int name="hl.fragsize">100</int>
+ </lst>
+ </fragmenter>
+
+ <!-- A regular-expression-based fragmenter
+ (for sentence extraction)
+ -->
+ <fragmenter name="regex"
+ class="solr.highlight.RegexFragmenter">
+ <lst name="defaults">
+ <!-- slightly smaller fragsizes work better because of slop -->
+ <int name="hl.fragsize">70</int>
+ <!-- allow 50% slop on fragment sizes -->
+ <float name="hl.regex.slop">0.5</float>
+ <!-- a basic sentence pattern -->
+ <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
+ </lst>
+ </fragmenter>
+
+ <!-- Configure the standard formatter -->
+ <formatter name="html"
+ default="true"
+ class="solr.highlight.HtmlFormatter">
+ <lst name="defaults">
+ <str name="hl.simple.pre"><![CDATA[<em>]]></str>
+ <str name="hl.simple.post"><![CDATA[</em>]]></str>
+ </lst>
+ </formatter>
+
+ <!-- Configure the standard encoder -->
+ <encoder name="html"
+ class="solr.highlight.HtmlEncoder" />
+
+ <!-- Configure the standard fragListBuilder -->
+ <fragListBuilder name="simple"
+ class="solr.highlight.SimpleFragListBuilder"/>
+
+ <!-- Configure the single fragListBuilder -->
+ <fragListBuilder name="single"
+ class="solr.highlight.SingleFragListBuilder"/>
+
+ <!-- Configure the weighted fragListBuilder -->
+ <fragListBuilder name="weighted"
+ default="true"
+ class="solr.highlight.WeightedFragListBuilder"/>
+
+ <!-- default tag FragmentsBuilder -->
+ <fragmentsBuilder name="default"
+ default="true"
+ class="solr.highlight.ScoreOrderFragmentsBuilder">
+ <!--
+ <lst name="defaults">
+ <str name="hl.multiValuedSeparatorChar">/</str>
+ </lst>
+ -->
+ </fragmentsBuilder>
+
+ <!-- multi-colored tag FragmentsBuilder -->
+ <fragmentsBuilder name="colored"
+ class="solr.highlight.ScoreOrderFragmentsBuilder">
+ <lst name="defaults">
+ <str name="hl.tag.pre"><![CDATA[
+ <b style="background:yellow">,<b style="background:lawgreen">,
+ <b style="background:aquamarine">,<b style="background:magenta">,
+ <b style="background:palegreen">,<b style="background:coral">,
+ <b style="background:wheat">,<b style="background:khaki">,
+ <b style="background:lime">,<b style="background:deepskyblue">]]></str>
+ <str name="hl.tag.post"><![CDATA[</b>]]></str>
+ </lst>
+ </fragmentsBuilder>
+
+ <boundaryScanner name="default"
+ default="true"
+ class="solr.highlight.SimpleBoundaryScanner">
+ <lst name="defaults">
+ <str name="hl.bs.maxScan">10</str>
+ <str name="hl.bs.chars">.,!? 	 </str>
+ </lst>
+ </boundaryScanner>
+
+ <boundaryScanner name="breakIterator"
+ class="solr.highlight.BreakIteratorBoundaryScanner">
+ <lst name="defaults">
+ <!-- type should be one of CHARACTER, WORD(default), LINE and SENTENCE -->
+ <str name="hl.bs.type">WORD</str>
+        <!-- language and country are used when constructing the Locale object, -->
+        <!-- which in turn is used when getting an instance of BreakIterator -->
+ <str name="hl.bs.language">en</str>
+ <str name="hl.bs.country">US</str>
+ </lst>
+ </boundaryScanner>
+ </highlighting>
+ </searchComponent>
+
+ <!-- Update Processors
+
+ Chains of Update Processor Factories for dealing with Update
+ Requests can be declared, and then used by name in Update
+ Request Processors
+
+ http://wiki.apache.org/solr/UpdateRequestProcessor
+
+ -->
+ <!-- Deduplication
+
+ An example dedup update processor that creates the "id" field
+ on the fly based on the hash code of some other fields. This
+ example has overwriteDupes set to false since we are using the
+ id field as the signatureField and Solr will maintain
+ uniqueness based on that anyway.
+
+ -->
+ <!--
+ <updateRequestProcessorChain name="dedupe">
+ <processor class="solr.processor.SignatureUpdateProcessorFactory">
+ <bool name="enabled">true</bool>
+ <str name="signatureField">id</str>
+ <bool name="overwriteDupes">false</bool>
+ <str name="fields">name,features,cat</str>
+ <str name="signatureClass">solr.processor.Lookup3Signature</str>
+ </processor>
+ <processor class="solr.LogUpdateProcessorFactory" />
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+ -->
+
+ <!-- Language identification
+
+ This example update chain identifies the language of the incoming
+ documents using the langid contrib. The detected language is
+ written to field language_s. No field name mapping is done.
+ The fields used for detection are text, title, subject and description,
+       making this example suitable for detecting languages from full-text
+ rich documents injected via ExtractingRequestHandler.
+ See more about langId at http://wiki.apache.org/solr/LanguageDetection
+ -->
+ <!--
+ <updateRequestProcessorChain name="langid">
+ <processor class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory">
+ <str name="langid.fl">text,title,subject,description</str>
+ <str name="langid.langField">language_s</str>
+ <str name="langid.fallback">en</str>
+ </processor>
+ <processor class="solr.LogUpdateProcessorFactory" />
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+ -->
+
+ <!-- Script update processor
+
+ This example hooks in an update processor implemented using JavaScript.
+
+ See more about the script update processor at http://wiki.apache.org/solr/ScriptUpdateProcessor
+ -->
+ <!--
+ <updateRequestProcessorChain name="script">
+ <processor class="solr.StatelessScriptUpdateProcessorFactory">
+ <str name="script">update-script.js</str>
+ <lst name="params">
+ <str name="config_param">example config parameter</str>
+ </lst>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+ -->
+
+ <!-- Response Writers
+
+ http://wiki.apache.org/solr/QueryResponseWriter
+
+ Request responses will be written using the writer specified by
+ the 'wt' request parameter matching the name of a registered
+ writer.
+
+ The "default" writer is the default and will be used if 'wt' is
+ not specified in the request.
+ -->
+ <!-- The following response writers are implicitly configured unless
+ overridden...
+ -->
+ <!--
+ <queryResponseWriter name="xml"
+ default="true"
+ class="solr.XMLResponseWriter" />
+ <queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
+ <queryResponseWriter name="python" class="solr.PythonResponseWriter"/>
+ <queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/>
+ <queryResponseWriter name="php" class="solr.PHPResponseWriter"/>
+ <queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/>
+ <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/>
+ -->
+
+ <queryResponseWriter name="json" class="solr.JSONResponseWriter">
+ <!-- For the purposes of the tutorial, JSON responses are written as
+ plain text so that they are easy to read in *any* browser.
+ If you expect a MIME type of "application/json" just remove this override.
+ -->
+ <str name="content-type">text/plain; charset=UTF-8</str>
+ </queryResponseWriter>
+
+ <!--
+ Custom response writers can be declared as needed...
+ -->
+ <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/>
+
+
+ <!-- XSLT response writer transforms the XML output by any xslt file found
+       in Solr's conf/xslt directory. Changes to xslt files are checked
+       every xsltCacheLifetimeSeconds seconds.
+ -->
+ <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
+ <int name="xsltCacheLifetimeSeconds">5</int>
+ </queryResponseWriter>
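+  <!-- e.g. /select?q=*:*&wt=xslt&tr=example.xsl applies conf/xslt/example.xsl
+       to the XML response; the stylesheet name is whatever you provide. -->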
+
+ <!-- Query Parsers
+
+ http://wiki.apache.org/solr/SolrQuerySyntax
+
+ Multiple QParserPlugins can be registered by name, and then
+ used in either the "defType" param for the QueryComponent (used
+ by SearchHandler) or in LocalParams
+ -->
+ <!-- example of registering a query parser -->
+ <!--
+ <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
+ -->
+
+ <!-- Function Parsers
+
+ http://wiki.apache.org/solr/FunctionQuery
+
+ Multiple ValueSourceParsers can be registered by name, and then
+ used as function names when using the "func" QParser.
+ -->
+ <!-- example of registering a custom function parser -->
+ <!--
+ <valueSourceParser name="myfunc"
+ class="com.mycompany.MyValueSourceParser" />
+ -->
+
+
+ <!-- Document Transformers
+ http://wiki.apache.org/solr/DocTransformers
+ -->
+ <!--
+ Could be something like:
+ <transformer name="db" class="com.mycompany.LoadFromDatabaseTransformer" >
+ <int name="connection">jdbc://....</int>
+ </transformer>
+
+ To add a constant value to all docs, use:
+ <transformer name="mytrans2" class="org.apache.solr.response.transform.ValueAugmenterFactory" >
+ <int name="value">5</int>
+ </transformer>
+
+ If you want the user to still be able to change it with _value:something_ use this:
+ <transformer name="mytrans3" class="org.apache.solr.response.transform.ValueAugmenterFactory" >
+ <double name="defaultValue">5</double>
+ </transformer>
+
+ If you are using the QueryElevationComponent, you may wish to mark documents that get boosted. The
+ EditorialMarkerFactory will do exactly that:
+ <transformer name="qecBooster" class="org.apache.solr.response.transform.EditorialMarkerFactory" />
+ -->
+
+
+ <!-- Legacy config for the admin interface -->
+ <admin>
+ <defaultQuery>*:*</defaultQuery>
+ </admin>
+
+</config>