From: Marcin Koziej
Date: Mon, 10 Sep 2012 13:03:33 +0000 (+0200)
Subject: Merge branch 'master' into sunburnt
X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/ac6111b68183ba7da48252eb0f7389b24ede20a9?hp=6835539fb69c791e27d536c8136a7ee199237754

Merge branch 'master' into sunburnt
---

diff --git a/apps/catalogue/management/commands/importbooks.py b/apps/catalogue/management/commands/importbooks.py
index 637e2148f..8f181cf98 100644
--- a/apps/catalogue/management/commands/importbooks.py
+++ b/apps/catalogue/management/commands/importbooks.py
@@ -44,8 +44,6 @@ class Command(BaseCommand):
             file_base, ext = os.path.splitext(file_path)
             book = Book.from_xml_file(file_path, overwrite=options.get('force'),
                                       dont_build=dont_build,
-                                      search_index=options.get('search_index'),
-                                      search_index_reuse=True,
                                       search_index_tags=False)
             for ebook_format in Book.ebook_formats:
                 if os.path.isfile(file_base + '.' + ebook_format):
@@ -80,13 +78,15 @@ class Command(BaseCommand):
                     time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(wait_until)),
                     wait_until - time.time())

+        index = None
         if options.get('search_index') and not settings.NO_SEARCH_INDEX:
             index = Index()
-            index.open()
             try:
                 index.index_tags()
-            finally:
-                index.close()
+                index.index.commit()
+            except Exception, e:
+                index.index.rollback()
+                raise e

         # Start transaction management.
         transaction.commit_unless_managed()
@@ -95,7 +95,7 @@ class Command(BaseCommand):

         files_imported = 0
         files_skipped = 0
-        
+
         for dir_name in directories:
             if not os.path.isdir(dir_name):
                 print self.style.ERROR("%s: Not a directory. Skipping." % dir_name)
@@ -126,7 +126,7 @@ class Command(BaseCommand):
                         self.import_book(file_path, options)
                         files_imported += 1
                         transaction.commit()
-                        
+
                     except (Book.AlreadyExists, Picture.AlreadyExists):
                         print self.style.ERROR('%s: Book or Picture already imported. Skipping. To overwrite use --force.'
                                                % file_path)
@@ -158,4 +158,3 @@ class Command(BaseCommand):

         transaction.commit()
         transaction.leave_transaction_management()
-
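For orientation: the importbooks hunks above drop the old ReusableIndex open/close cycle in favour of a single search.Index instance with an explicit Solr commit/rollback. A minimal sketch of that pattern, assuming only the Index and Book.search_index(index=..., commit=...) API introduced in this diff; the reindex_books helper itself is hypothetical:

    # Sketch only: batch reindexing with one Solr connection and a single commit.
    from catalogue.models import Book
    from search import Index

    def reindex_books(book_ids, index_tags=True):
        index = Index()                              # SolrIndex opened in 'rw' mode
        try:
            for book_id in book_ids:
                book = Book.objects.get(id=book_id)
                # reuse the connection; commit=False defers the Solr commit
                book.search_index(index=index, index_tags=False, commit=False)
            if index_tags:
                index.index_tags()                   # refresh tag documents as well
            index.index.commit()                     # one commit for the whole batch
        except Exception:
            index.index.rollback()                   # leave the Solr index unchanged on error
            raise

Per-book indexing during import is normally handed off to the index_book Celery task instead, as the catalogue/models/book.py and catalogue/tasks.py hunks below show. The read side goes through the new Search class added in apps/search/index.py further down; a correspondingly minimal sketch, with a placeholder phrase and hypothetical result handling:

    # Sketch only: querying the Solr-backed index.
    from search import Search

    srch = Search()                                  # read-only SolrIndex (mode='r')
    phrase = u"sample query"                         # placeholder

    results = srch.search_phrase(phrase)             # list of SearchResult objects
    for result in results:
        srch.get_snippets(result, phrase, num=1)     # fetches and highlights one snippet
        print result.book_id, result.score, result.snippets

    print srch.hint_tags(u"mick", prefix=True)       # prefix auto-complete over tag names
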
diff --git a/apps/catalogue/models/book.py b/apps/catalogue/models/book.py
index 6a31f4bff..5bc1e1021 100644
--- a/apps/catalogue/models/book.py
+++ b/apps/catalogue/models/book.py
@@ -189,20 +189,20 @@ class Book(models.Model):
         paths = map(lambda bm: (None, bm.file.path), bm)
         return create_zip(paths, "%s_%s" % (self.slug, format_))

-    def search_index(self, book_info=None, reuse_index=False, index_tags=True):
+    def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
         import search
-        if reuse_index:
-            idx = search.ReusableIndex()
-        else:
-            idx = search.Index()
-
-        idx.open()
+        if index is None:
+            index = search.Index()
         try:
-            idx.index_book(self, book_info)
+            index.index_book(self, book_info)
             if index_tags:
                 idx.index_tags()
-        finally:
-            idx.close()
+            if commit:
+                index.index.commit()
+        except Exception, e:
+            index.index.rollback()
+            raise e
+

     @classmethod
     def from_xml_file(cls, xml_file, **kwargs):
@@ -223,7 +223,7 @@ class Book(models.Model):
     @classmethod
     def from_text_and_meta(cls, raw_file, book_info, overwrite=False,
             dont_build=None, search_index=True,
-            search_index_tags=True, search_index_reuse=False):
+            search_index_tags=True):
         if dont_build is None:
             dont_build = set()
         dont_build = set.union(set(dont_build), set(app_settings.DONT_BUILD))
@@ -310,8 +310,7 @@ class Book(models.Model):
                 getattr(book, '%s_file' % format_).build_delay()

         if not settings.NO_SEARCH_INDEX and search_index:
-            book.search_index(index_tags=search_index_tags, reuse_index=search_index_reuse)
-            #index_book.delay(book.id, book_info)
+            tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags)

         for child in notify_cover_changed:
             child.parent_cover_changed()
diff --git a/apps/catalogue/tasks.py b/apps/catalogue/tasks.py
index 03f4a3973..e7f7b2947 100644
--- a/apps/catalogue/tasks.py
+++ b/apps/catalogue/tasks.py
@@ -24,10 +24,10 @@ def fix_tree_tags(book):


 @task
-def index_book(book_id, book_info=None):
+def index_book(book_id, book_info=None, **kwargs):
     from catalogue.models import Book
     try:
-        return Book.objects.get(id=book_id).search_index(book_info)
+        return Book.objects.get(id=book_id).search_index(book_info, **kwargs)
     except Exception, e:
         print "Exception during index: %s" % e
         print_exc()
diff --git a/apps/catalogue/templates/catalogue/book_searched.html b/apps/catalogue/templates/catalogue/book_searched.html
index 1a345ed27..4ac66fb19 100644
--- a/apps/catalogue/templates/catalogue/book_searched.html
+++ b/apps/catalogue/templates/catalogue/book_searched.html
@@ -7,15 +7,21 @@
 {% block right-column %}
{% for hit in hits %} - {% if hit.snippets %} - + {% if hit.snippet %} + {% else %} {% if hit.fragment %} {% endif %} {% endif %} diff --git a/apps/opds/views.py b/apps/opds/views.py index 8fe0b8f1b..297c2120a 100644 --- a/apps/opds/views.py +++ b/apps/opds/views.py @@ -16,7 +16,7 @@ from django.contrib.sites.models import Site from basicauth import logged_in_or_basicauth, factory_decorator from catalogue.models import Book, Tag -from search.views import get_search, SearchResult, JVM +from search.views import Search, SearchResult from lucene import Term, QueryWrapperFilter, TermQuery import logging @@ -92,7 +92,7 @@ class OPDSFeed(Atom1Feed): {u"href": reverse("opds_authors"), u"rel": u"start", u"type": u"application/atom+xml"}) - handler.addQuickElement(u"link", None, + handler.addQuickElement(u"link", None, {u"href": full_url(os.path.join(settings.STATIC_URL, "opensearch.xml")), u"rel": u"search", u"type": u"application/opensearchdescription+xml"}) @@ -329,7 +329,7 @@ class SearchFeed(AcquisitionFeed): title = u"Wyniki wyszukiwania" INLINE_QUERY_RE = re.compile(r"(author:(?P[^ ]+)|title:(?P[^ ]+)|categories:(?P<categories>[^ ]+)|description:(?P<description>[^ ]+))") - + def get_object(self, request): """ For OPDS 1.1 We should handle a query for search terms @@ -337,7 +337,7 @@ class SearchFeed(AcquisitionFeed): OpenSearch defines fields: atom:author, atom:contributor (treated as translator), atom:title. Inline query provides author, title, categories (treated as book tags), description (treated as content search terms). - + if search terms are provided, we shall search for books according to Hint information (from author & contributror & title). @@ -345,10 +345,9 @@ class SearchFeed(AcquisitionFeed): (perhaps for is_book=True) """ - JVM.attachCurrentThread() query = request.GET.get('q', '') - + inline_criteria = re.findall(self.INLINE_QUERY_RE, query) if inline_criteria: def get_criteria(criteria, name, position): @@ -374,13 +373,13 @@ class SearchFeed(AcquisitionFeed): translator = request.GET.get('translator', '') # Our client didn't handle the opds placeholders - if author == '{atom:author}': author = '' + if author == '{atom:author}': author = '' if title == '{atom:title}': title = '' if translator == '{atom:contributor}': translator = '' categories = None fuzzy = False - srch = get_search() + srch = Search() hint = srch.hint() # Scenario 1: full search terms provided. 
@@ -389,12 +388,12 @@ class SearchFeed(AcquisitionFeed): filters = [] if author: - log.info( "narrow to author %s" % author) - hint.tags(srch.search_tags(srch.make_phrase(srch.get_tokens(author, field='authors'), field='authors'), + log.info("narrow to author %s" % author) + hint.tags(srch.search_tags(srch.make_phrase(srch.get_tokens(author, field='authors'), field='authors'), filt=srch.term_filter(Term('tag_category', 'author')))) if translator: - log.info( "filter by translator %s" % translator) + log.info("filter by translator %s" % translator) filters.append(QueryWrapperFilter( srch.make_phrase(srch.get_tokens(translator, field='translators'), field='translators'))) @@ -406,13 +405,13 @@ class SearchFeed(AcquisitionFeed): flt = srch.chain_filters(filters) if title: - log.info( "hint by book title %s" % title) + log.info("hint by book title %s" % title) q = srch.make_phrase(srch.get_tokens(title, field='title'), field='title') hint.books(*srch.search_books(q, filt=flt)) toks = srch.get_tokens(query) log.info("tokens for query: %s" % toks) - + results = SearchResult.aggregate(srch.search_perfect_book(toks, fuzzy=fuzzy, hint=hint), srch.search_perfect_parts(toks, fuzzy=fuzzy, hint=hint), srch.search_everywhere(toks, fuzzy=fuzzy, hint=hint)) diff --git a/apps/search/__init__.py b/apps/search/__init__.py index 1451fa214..279d281e3 100644 --- a/apps/search/__init__.py +++ b/apps/search/__init__.py @@ -1,3 +1,4 @@ -import lucene +#import lucene -from index import Index, Search, ReusableIndex, SearchResult, JVM, IndexChecker, IndexStore +#from index import Index, Search, ReusableIndex, SearchResult, JVM, IndexChecker, IndexStore +from index import Index, Search, SearchResult diff --git a/apps/search/custom.py b/apps/search/custom.py new file mode 100644 index 000000000..788b6c409 --- /dev/null +++ b/apps/search/custom.py @@ -0,0 +1,160 @@ + +from sunburnt import sunburnt +from lxml import etree +import urllib +import warnings +from sunburnt import search +import copy + + +class TermVectorOptions(search.Options): + def __init__(self, schema, original=None): + self.schema = schema + if original is None: + self.fields = set() + self.positions = False + else: + self.fields = copy.copy(original.fields) + self.positions = copy.copy(original.positions) + + def update(self, positions=False, fields=None): + if fields is None: + fields = [] + if isinstance(fields, basestring): + fields = [fields] + self.schema.check_fields(fields, {"stored": True}) + self.fields.update(fields) + self.positions = positions + + def options(self): + opts = {} + if self.positions or self.fields: + opts['tv'] = 'true' + if self.positions: + opts['tv.positions'] = 'true' + if self.fields: + opts['tv.fl'] = ','.join(sorted(self.fields)) + return opts + + +class CustomSolrConnection(sunburnt.SolrConnection): + def __init__(self, *args, **kw): + super(CustomSolrConnection, self).__init__(*args, **kw) + self.analysis_url = self.url + "analysis/field/" + + def analyze(self, params): + qs = urllib.urlencode(params) + url = "%s?%s" % (self.analysis_url, qs) + if len(url) > self.max_length_get_url: + warnings.warn("Long query URL encountered - POSTing instead of " + "GETting. 
This query will not be cached at the HTTP layer") + url = self.analysis_url + kwargs = dict( + method="POST", + body=qs, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + ) + else: + kwargs = dict(method="GET") + r, c = self.request(url, **kwargs) + if r.status != 200: + raise sunburnt.SolrError(r, c) + return c + + +# monkey patching sunburnt SolrSearch +search.SolrSearch.option_modules += ('term_vectorer',) + + +def __term_vector(self, positions=False, fields=None): + newself = self.clone() + newself.term_vectorer.update(positions, fields) + return newself +setattr(search.SolrSearch, 'term_vector', __term_vector) + + +def __patched__init_common_modules(self): + __original__init_common_modules(self) + self.term_vectorer = TermVectorOptions(self.schema) +__original__init_common_modules = search.SolrSearch._init_common_modules +setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules) + + +class CustomSolrInterface(sunburnt.SolrInterface): + # just copied from parent and SolrConnection -> CustomSolrConnection + def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL): + self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url) + self.schemadoc = schemadoc + if 'w' not in mode: + self.writeable = False + elif 'r' not in mode: + self.readable = False + self.init_schema() + + def _analyze(self, **kwargs): + if not self.readable: + raise TypeError("This Solr instance is only for writing") + args = { + 'analysis_showmatch': True + } + if 'field' in kwargs: args['analysis_fieldname'] = kwargs['field'] + if 'text' in kwargs: args['analysis_fieldvalue'] = kwargs['text'] + if 'q' in kwargs: args['q'] = kwargs['q'] + if 'query' in kwargs: args['q'] = kwargs['q'] + + params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args)) + + content = self.conn.analyze(params) + doc = etree.fromstring(content) + return doc + + def highlight(self, **kwargs): + doc = self._analyze(**kwargs) + analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']") + matches = set() + for wrd in analyzed: + start = int(wrd.xpath("int[@name='start']")[0].text) + end = int(wrd.xpath("int[@name='end']")[0].text) + matches.add((start, end)) + + if matches: + return self.substring(kwargs['text'], matches, + margins=kwargs.get('margins', 30), + mark=kwargs.get('mark', ("<b>", "</b>"))) + else: + return None + + def analyze(self, **kwargs): + doc = self._analyze(**kwargs) + terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]") + terms = map(lambda n: unicode(n.text), terms) + return terms + + def substring(self, text, matches, margins=30, mark=("<b>", "</b>")): + start = None + end = None + totlen = len(text) + matches_margins = map(lambda (s, e): + ((s, e), + (max(0, s - margins), min(totlen, e + margins))), + matches) + (start, end) = matches_margins[0][1] + matches = [] + for (m, (s, e)) in matches_margins[1:]: + if end < s or start > e: + continue + start = min(start, s) + end = max(end, e) + matches.append(m) + + snip = text[start:end] + matches.sort(lambda a, b: cmp(b[0], a[0])) + + for (s, e) in matches: + off = - start + snip = snip[:e + off] + mark[1] + snip[e + off:] + snip = snip[:s + off] + mark[0] + snip[s + off:] + # maybe break on word boundaries + + return snip + diff --git a/apps/search/index.py b/apps/search/index.py index a0bf71588..557f4045c 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ 
-1,26 +1,7 @@ # -*- coding: utf-8 -*- from django.conf import settings -from django.dispatch import Signal -from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \ - File, Field, Integer, \ - NumericField, Version, Document, JavaError, IndexSearcher, \ - QueryParser, PerFieldAnalyzerWrapper, \ - SimpleAnalyzer, PolishAnalyzer, ArrayList, \ - KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \ - BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \ - HashSet, BooleanClause, Term, CharTermAttribute, \ - PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \ - FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \ - SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \ - BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \ - initVM, CLASSPATH, JArray, JavaError - # KeywordAnalyzer - -# Initialize jvm -JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP) - -import sys + import os import re import errno @@ -29,84 +10,18 @@ from librarian.parser import WLDocument from lxml import etree import catalogue.models from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook -from multiprocessing.pool import ThreadPool -from threading import current_thread from itertools import chain -import atexit import traceback import logging log = logging.getLogger('search') +import sunburnt +import custom +import operator -class WLAnalyzer(PerFieldAnalyzerWrapper): - def __init__(self): - polish = PolishAnalyzer(Version.LUCENE_34) - # polish_gap.setPositionIncrementGap(999) - - simple = SimpleAnalyzer(Version.LUCENE_34) - # simple_gap.setPositionIncrementGap(999) - - keyword = KeywordAnalyzer(Version.LUCENE_34) - - # not sure if needed: there's NOT_ANALYZED meaning basically the same - - PerFieldAnalyzerWrapper.__init__(self, polish) - - self.addAnalyzer("tags", simple) - self.addAnalyzer("technical_editors", simple) - self.addAnalyzer("editors", simple) - self.addAnalyzer("url", keyword) - self.addAnalyzer("source_url", keyword) - self.addAnalyzer("source_name", simple) - self.addAnalyzer("publisher", simple) - self.addAnalyzer("authors", simple) - self.addAnalyzer("title", simple) - - self.addAnalyzer("is_book", keyword) - # shouldn't the title have two forms? _pl and simple? - - self.addAnalyzer("themes", simple) - self.addAnalyzer("themes_pl", polish) - - self.addAnalyzer("tag_name", simple) - self.addAnalyzer("tag_name_pl", polish) - - self.addAnalyzer("translators", simple) - - self.addAnalyzer("KEYWORD", keyword) - self.addAnalyzer("SIMPLE", simple) - self.addAnalyzer("POLISH", polish) - - -class IndexStore(object): - """ - Provides access to search index. - - self.store - lucene index directory - """ - def __init__(self): - self.make_index_dir() - self.store = NIOFSDirectory(File(settings.SEARCH_INDEX)) - - def make_index_dir(self): - try: - os.makedirs(settings.SEARCH_INDEX) - except OSError as exc: - if exc.errno == errno.EEXIST: - pass - else: raise - def close(self): - self.store.close() - - -class IndexChecker(IndexStore): - def __init__(self): - IndexStore.__init__(self) - - def check(self): - checker = CheckIndex(self.store) - status = checker.checkIndex() - return status +class SolrIndex(object): + def __init__(self, mode=None): + self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode) class Snippets(object): @@ -191,60 +106,38 @@ class Snippets(object): pass -class BaseIndex(IndexStore): - """ - Base index class. 
- Provides basic operations on index: opening, closing, optimizing. - """ - def __init__(self, analyzer=None): - super(BaseIndex, self).__init__() - self.index = None - if not analyzer: - analyzer = WLAnalyzer() - self.analyzer = analyzer - - def open(self, timeout=None): - if self.index: - raise Exception("Index is already opened") - conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer) - if timeout: - conf.setWriteLockTimeout(long(timeout)) - self.index = IndexWriter(self.store, conf) - return self.index - - def optimize(self): - self.index.optimize() - - def close(self): - try: - self.index.optimize() - except JavaError, je: - log.error("Error during optimize phase, check index: %s" % je) - - self.index.close() - self.index = None - - index_changed.send_robust(self) - - super(BaseIndex, self).close() - - def __enter__(self): - self.open() - return self - - def __exit__(self, type, value, tb): - self.close() - - -index_changed = Signal() - - -class Index(BaseIndex): +class Index(SolrIndex): """ Class indexing books. """ - def __init__(self, analyzer=None): - super(Index, self).__init__(analyzer) + def __init__(self): + super(Index, self).__init__(mode='rw') + + def delete_query(self, *queries): + """ + index.delete(queries=...) doesn't work, so let's reimplement it + using deletion of list of uids. + """ + uids = set() + for q in queries: + if isinstance(q, sunburnt.search.LuceneQuery): + q = self.index.query(q) + q.field_limiter.update(['uid']) + st = 0 + rows = 100 + while True: + ids = q.paginate(start=st, rows=rows).execute() + if not len(ids): + break + for res in ids: + uids.add(res['uid']) + st += rows + # print "Will delete %s" % ','.join([x for x in uids]) + if uids: + self.index.delete(uids) + return True + else: + return False def index_tags(self, *tags, **kw): """ @@ -255,25 +148,23 @@ class Index(BaseIndex): remove_only = kw.get('remove_only', False) # first, remove tags from index. 
if tags: - q = BooleanQuery() + tag_qs = [] for tag in tags: - b_id_cat = BooleanQuery() - - q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True) - b_id_cat.add(q_id, BooleanClause.Occur.MUST) + q_id = self.index.Q(tag_id=tag.id) if isinstance(tag, PDCounterAuthor): - q_cat = TermQuery(Term('tag_category', 'pd_author')) + q_cat = self.index.Q(tag_category='pd_author') elif isinstance(tag, PDCounterBook): - q_cat = TermQuery(Term('tag_category', 'pd_book')) + q_cat = self.index.Q(tag_category='pd_book') else: - q_cat = TermQuery(Term('tag_category', tag.category)) - b_id_cat.add(q_cat, BooleanClause.Occur.MUST) + q_cat = self.index.Q(tag_category=tag.category) - q.add(b_id_cat, BooleanClause.Occur.SHOULD) + q_id_cat = self.index.Q(q_id & q_cat) + tag_qs.append(q_id_cat) + self.delete_query(tag_qs) else: # all - q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) - self.index.deleteDocuments(q) + q = self.index.Q(tag_id__any=True) + self.delete_query(q) if not remove_only: # then add them [all or just one passed] @@ -284,37 +175,43 @@ class Index(BaseIndex): for tag in tags: if isinstance(tag, PDCounterAuthor): - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) - doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED)) - doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) + doc = { + "tag_id": int(tag.id), + "tag_name": tag.name, + "tag_name_pl": tag.name, + "tag_category": 'pd_author', + "is_pdcounter": True, + "uid": "tag%d_pd_a" % tag.id + } elif isinstance(tag, PDCounterBook): - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) - doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED)) - doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) + doc = { + "tag_id": int(tag.id), + "tag_name": tag.title, + "tag_name_pl": tag.title, + "tag_category": 'pd_book', + "is_pdcounter": True, + "uid": "tag%d_pd_b" % tag.id + } else: - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) - doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) + doc = { + "tag_id": int(tag.id), + "tag_name": tag.name, + "tag_name_pl": tag.name, + "tag_category": tag.category, + "is_pdcounter": False, + "uid": "tag%d" % tag.id + } + self.index.add(doc) def create_book_doc(self, book): """ Create a lucene document referring book id. 
""" - doc = Document() - doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id))) + doc = { + 'book_id': int(book.id), + } if book.parent is not None: - doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id))) + doc["parent_id"] = int(book.parent.id) return doc def remove_book(self, book_or_id, remove_snippets=True): @@ -325,8 +222,7 @@ class Index(BaseIndex): else: book_id = book_or_id - q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True) - self.index.deleteDocuments(q) + self.delete_query(self.index.Q(book_id=book_id)) if remove_snippets: snippets = Snippets(book_id) @@ -348,17 +244,22 @@ class Index(BaseIndex): # let's not index it - it's only used for extracting publish date if 'source_name' in meta_fields: del meta_fields['source_name'] - - for f in meta_fields.values(): - if isinstance(f, list) or isinstance(f, tuple): - for elem in f: - book_doc.add(elem) - else: - book_doc.add(f) - self.index.addDocument(book_doc) + + for n, f in meta_fields.items(): + book_doc[n] = f + + book_doc['uid'] = "book%s" % book_doc['book_id'] + self.index.add(book_doc) del book_doc + book_fields = { + 'title': meta_fields['title'], + 'authors': meta_fields['authors'], + 'published_date': meta_fields['published_date'] + } + if 'translators' in meta_fields: + book_fields['translators'] = meta_fields['translators'] - self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']]) + self.index_content(book, book_fields=book_fields) master_tags = [ 'opowiadanie', @@ -391,9 +292,9 @@ class Index(BaseIndex): if book_info is None: book_info = dcparser.parse(open(book.xml_file.path)) - fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS) - fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags') - fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED) + fields['slug'] = book.slug + fields['tags'] = [t.name for t in book.tags] + fields['is_book'] = True # validator, name for field in dcparser.BookInfo.FIELDS: @@ -408,21 +309,17 @@ class Index(BaseIndex): s = getattr(book_info, field.name) if field.multiple: s = ', '.join(s) - try: - fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED) - except JavaError as je: - raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args)) + fields[field.name] = s elif type_indicator == dcparser.as_person: p = getattr(book_info, field.name) if isinstance(p, dcparser.Person): persons = unicode(p) else: persons = ', '.join(map(unicode, p)) - fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED) + fields[field.name] = persons elif type_indicator == dcparser.as_date: dt = getattr(book_info, field.name) - fields[field.name] = Field(field.name, "%04d%02d%02d" %\ - (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED) + fields[field.name] = dt # get published date pd = None @@ -431,19 +328,19 @@ class Index(BaseIndex): if match is not None: pd = str(match.groups()[0]) if not pd: pd = "" - fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED) + fields["published_date"] = pd return fields - def add_gaps(self, fields, fieldname): - """ - Interposes a list of fields with gap-fields, which are indexed spaces and returns it. 
- This allows for doing phrase queries which do not overlap the gaps (when slop is 0). - """ - def gap(): - while True: - yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED) - return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1] + # def add_gaps(self, fields, fieldname): + # """ + # Interposes a list of fields with gap-fields, which are indexed spaces and returns it. + # This allows for doing phrase queries which do not overlap the gaps (when slop is 0). + # """ + # def gap(): + # while True: + # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED) + # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1] def get_master(self, root): """ @@ -453,7 +350,7 @@ class Index(BaseIndex): if master.tag in self.master_tags: return master - def index_content(self, book, book_fields=[]): + def index_content(self, book, book_fields={}): """ Walks the book XML and extract content from it. Adds parts for each header tag and for each fragment. @@ -496,41 +393,31 @@ class Index(BaseIndex): def add_part(snippets, **fields): doc = self.create_book_doc(book) - for f in book_fields: - doc.add(f) + for n, v in book_fields.items(): + doc[n] = v + + doc['header_index'] = fields["header_index"] + doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1 + doc['header_type'] = fields['header_type'] - doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"])) - doc.add(NumericField("header_span", Field.Store.YES, True)\ - .setIntValue('header_span' in fields and fields['header_span'] or 1)) - doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc['text'] = fields['text'] - doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \ - Field.TermVector.WITH_POSITIONS_OFFSETS)) + # snippets + snip_pos = snippets.add(fields["text"]) - snip_pos = snippets.add(fields["content"]) - doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0])) - doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1])) + doc['snippets_position'] = snip_pos[0] + doc['snippets_length'] = snip_pos[1] if snippets.revision: - doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision)) + doc["snippets_revision"] = snippets.revision if 'fragment_anchor' in fields: - doc.add(Field("fragment_anchor", fields['fragment_anchor'], - Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc["fragment_anchor"] = fields['fragment_anchor'] if 'themes' in fields: - themes, themes_pl = zip(*[ - (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS), - Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS)) - for theme in fields['themes']]) - - themes = self.add_gaps(themes, 'themes') - themes_pl = self.add_gaps(themes_pl, 'themes_pl') - - for t in themes: - doc.add(t) - for t in themes_pl: - doc.add(t) - + doc['themes'] = fields['themes'] + doc['uid'] = "part%s%s%s" % (doc['header_index'], + doc['header_span'], + doc.get('fragment_anchor', '')) return doc def give_me_utf8(s): @@ -555,39 +442,39 @@ class Index(BaseIndex): def all_content(text): for frag in fragments.values(): - frag['content'].append(text) + frag['text'].append(text) content.append(text) handle_text = [all_content] - for start, text, end in walker(header, ignore_tags=self.ignore_content_tags): # handle footnotes if start is not None and start.tag in 
self.footnote_tags: footnote = [] + def collect_footnote(t): footnote.append(t) + handle_text.append(collect_footnote) elif end is not None and footnote is not [] and end.tag in self.footnote_tags: handle_text.pop() doc = add_part(snippets, header_index=position, header_type=header.tag, - content=u''.join(footnote), - is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)) - - self.index.addDocument(doc) + text=u''.join(footnote), + is_footnote=True) + self.index.add(doc) #print "@ footnote text: %s" % footnote footnote = [] - + # handle fragments and themes. if start is not None and start.tag == 'begin': fid = start.attrib['id'][1:] - fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag} + fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag} # themes for this fragment elif start is not None and start.tag == 'motyw': fid = start.attrib['id'][1:] handle_text.append(None) if start.text is not None: - fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(','))) + fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(',')))) elif end is not None and end.tag == 'motyw': handle_text.pop() @@ -605,10 +492,10 @@ class Index(BaseIndex): header_index=frag['start_section'], header_span=position - frag['start_section'] + 1, fragment_anchor=fid, - content=fix_format(frag['content']), + text=fix_format(frag['text']), themes=frag['themes']) #print '@ FRAG %s' % frag['content'] - self.index.addDocument(doc) + self.index.add(doc) # Collect content. @@ -618,147 +505,64 @@ class Index(BaseIndex): hdl(text) # in the end, add a section text. - doc = add_part(snippets, header_index=position, header_type=header.tag, - content=fix_format(content)) + doc = add_part(snippets, header_index=position, + header_type=header.tag, text=fix_format(content)) #print '@ CONTENT: %s' % fix_format(content) - self.index.addDocument(doc) + self.index.add(doc) finally: snippets.close() -def log_exception_wrapper(f): - def _wrap(*a): - try: - f(*a) - except Exception, e: - log.error("Error in indexing thread: %s" % e) - traceback.print_exc() - raise e - return _wrap - - -class ReusableIndex(Index): - """ - Works like index, but does not close/optimize Lucene index - until program exit (uses atexit hook). - This is usefull for importbooks command. - - if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself. - """ - index = None - - def open(self, analyzer=None, **kw): - if ReusableIndex.index: - self.index = ReusableIndex.index - else: - Index.open(self, analyzer, **kw) - ReusableIndex.index = self.index - atexit.register(ReusableIndex.close_reusable) - - # def index_book(self, *args, **kw): - # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw) - # ReusableIndex.pool_jobs.append(job) - - @staticmethod - def close_reusable(): - if ReusableIndex.index: - ReusableIndex.index.optimize() - ReusableIndex.index.close() - ReusableIndex.index = None - - index_changed.send_robust(None) - - def close(self): - if ReusableIndex.index: - ReusableIndex.index.commit() - - -class JoinSearch(object): - """ - This mixin could be used to handle block join queries. 
- (currently unused) - """ - def __init__(self, *args, **kw): - super(JoinSearch, self).__init__(*args, **kw) - - def wrapjoins(self, query, fields=[]): - """ - This functions modifies the query in a recursive way, - so Term and Phrase Queries contained, which match - provided fields are wrapped in a BlockJoinQuery, - and so delegated to children documents. - """ - if BooleanQuery.instance_(query): - qs = BooleanQuery.cast_(query) - for clause in qs: - clause = BooleanClause.cast_(clause) - clause.setQuery(self.wrapjoins(clause.getQuery(), fields)) - return qs - else: - termset = HashSet() - query.extractTerms(termset) - for t in termset: - t = Term.cast_(t) - if t.field() not in fields: - return query - return BlockJoinQuery(query, self.parent_filter, - BlockJoinQuery.ScoreMode.Total) - - def bsearch(self, query, max_results=50): - q = self.query(query) - bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg) - - tops = self.searcher.search(bjq, max_results) - bks = [] - for found in tops.scoreDocs: - doc = self.searcher.doc(found.doc) - bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) - return (bks, tops.totalHits) - - class SearchResult(object): - def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None): - if tokens_cache is None: tokens_cache = {} - - if score: - self._score = score - else: - self._score = scoreDocs.score - + def __init__(self, doc, how_found=None, query=None, query_terms=None): + # self.search = search self.boost = 1.0 - self._hits = [] self._processed_hits = None # processed hits + self.snippets = [] + self.query_terms = query_terms + + if 'score' in doc: + self._score = doc['score'] + else: + self._score = 0 - stored = search.searcher.doc(scoreDocs.doc) - self.book_id = int(stored.get("book_id")) + self.book_id = int(doc["book_id"]) - pd = stored.get("published_date") try: - self.published_date = int(pd) + self.published_date = int(doc.get("published_date")) except ValueError: self.published_date = 0 - header_type = stored.get("header_type") + # content hits + header_type = doc.get("header_type", None) # we have a content hit in some header of fragment if header_type is not None: - sec = (header_type, int(stored.get("header_index"))) - header_span = stored.get('header_span') + sec = (header_type, int(doc["header_index"])) + header_span = doc['header_span'] header_span = header_span is not None and int(header_span) or 1 - - fragment = stored.get("fragment_anchor") - - if snippets: - snippets = snippets.replace("/\n", "\n") - hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []}) + fragment = doc.get("fragment_anchor", None) + snippets_pos = (doc['snippets_position'], doc['snippets_length']) + snippets_rev = doc['snippets_revision'] + + hit = (sec + (header_span,), fragment, self._score, { + 'how_found': how_found, + 'snippets_pos': snippets_pos, + 'snippets_revision': snippets_rev, + 'themes': doc.get('themes', []), + 'themes_pl': doc.get('themes_pl', []) + }) self._hits.append(hit) - self.search = search - self.searched = searched - self.tokens_cache = tokens_cache + def __unicode__(self): + return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \ + (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets)) + + def __str__(self): + return unicode(self).encode('utf-8') @property def score(self): @@ -775,31 +579,32 @@ class 
SearchResult(object): def get_book(self): if hasattr(self, '_book'): return self._book - return catalogue.models.Book.objects.get(id=self.book_id) + self._book = catalogue.models.Book.objects.get(id=self.book_id) + return self._book book = property(get_book) + POSITION = 0 + FRAGMENT = 1 + POSITION_INDEX = 1 + POSITION_SPAN = 2 + SCORE = 2 + OTHER = 3 + @property def hits(self): if self._processed_hits is not None: return self._processed_hits - POSITION = 0 - FRAGMENT = 1 - POSITION_INDEX = 1 - POSITION_SPAN = 2 - SCORE = 2 - OTHER = 3 - # to sections and fragments - frags = filter(lambda r: r[FRAGMENT] is not None, self._hits) + frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits) - sect = filter(lambda r: r[FRAGMENT] is None, self._hits) + sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits) # sections not covered by fragments sect = filter(lambda s: 0 == len(filter( - lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX] - and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN], + lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX] + and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect) hits = [] @@ -815,55 +620,61 @@ class SearchResult(object): return els.values() # remove fragments with duplicated fid's and duplicated snippets - frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE])) - frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT], - lambda a, b: cmp(a[SCORE], b[SCORE])) + frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE])) + # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT], + # lambda a, b: cmp(a[SCORE], b[SCORE])) # remove duplicate sections sections = {} for s in sect: - si = s[POSITION][POSITION_INDEX] + si = s[self.POSITION][self.POSITION_INDEX] # skip existing if si in sections: - if sections[si]['score'] >= s[SCORE]: + if sections[si]['score'] >= s[self.SCORE]: continue - m = {'score': s[SCORE], - 'section_number': s[POSITION][POSITION_INDEX] + 1, + m = {'score': s[self.SCORE], + 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1, } - m.update(s[OTHER]) + m.update(s[self.OTHER]) sections[si] = m hits = sections.values() for f in frags: try: - frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id) + frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id) except catalogue.models.Fragment.DoesNotExist: # stale index continue - # Figure out if we were searching for a token matching some word in theme name. 
themes = frag.tags.filter(category='theme') - themes_hit = [] - if self.searched is not None: - tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache) - for theme in themes: - name_tokens = self.search.get_tokens(theme.name, 'POLISH') - for t in tokens: - if t in name_tokens: - if not theme in themes_hit: - themes_hit.append(theme) + themes_hit = set() + if self.query_terms is not None: + for i in range(0, len(f[self.OTHER]['themes'])): + tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ') + tms = map(unicode.lower, tms) + for qt in self.query_terms: + if qt in tms: + themes_hit.add(f[self.OTHER]['themes'][i]) break - m = {'score': f[SCORE], + def theme_by_name(n): + th = filter(lambda t: t.name == n, themes) + if th: + return th[0] + else: + return None + themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit)) + + m = {'score': f[self.SCORE], 'fragment': frag, - 'section_number': f[POSITION][POSITION_INDEX] + 1, + 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1, 'themes': themes, 'themes_hit': themes_hit } - m.update(f[OTHER]) + m.update(f[self.OTHER]) hits.append(m) hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True) @@ -872,9 +683,6 @@ class SearchResult(object): return hits - def __unicode__(self): - return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score) - @staticmethod def aggregate(*result_lists): books = {} @@ -894,511 +702,306 @@ class SearchResult(object): else: return c + def __len__(self): + return len(self.hits) -class Hint(object): - """ - Given some hint information (information we already know about) - our search target - like author, title (specific book), epoch, genre, kind - we can narrow down search using filters. - """ - def __init__(self, search): - """ - Accepts a Searcher instance. - """ - self.search = search - self.book_tags = {} - self.part_tags = [] - self._books = [] - - def books(self, *books): - """ - Give a hint that we search these books. - """ - self._books = books + def snippet_pos(self, idx=0): + return self.hits[idx]['snippets_pos'] - def tags(self, tags): - """ - Give a hint that these Tag objects (a list of) - is necessary. - """ - for t in tags: - if t.category in ['author', 'title', 'epoch', 'genre', 'kind']: - lst = self.book_tags.get(t.category, []) - lst.append(t) - self.book_tags[t.category] = lst - if t.category in ['theme', 'theme_pl']: - self.part_tags.append(t) - - def tag_filter(self, tags, field='tags'): - """ - Given a lsit of tags and an optional field (but they are normally in tags field) - returns a filter accepting only books with specific tags. - """ - q = BooleanQuery() - - for tag in tags: - toks = self.search.get_tokens(tag.name, field=field) - tag_phrase = PhraseQuery() - for tok in toks: - tag_phrase.add(Term(field, tok)) - q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST)) - - return QueryWrapperFilter(q) - - def book_filter(self): - """ - Filters using book tags (all tag kinds except a theme) - """ - tags = reduce(lambda a, b: a + b, self.book_tags.values(), []) - if tags: - return self.tag_filter(tags) - else: + def snippet_revision(self, idx=0): + try: + return self.hits[idx]['snippets_revision'] + except: return None - def part_filter(self): - """ - This filter can be used to look for book parts. - It filters on book id and/or themes. 
- """ - fs = [] - if self.part_tags: - fs.append(self.tag_filter(self.part_tags, field='themes')) - - if self._books != []: - bf = BooleanFilter() - for b in self._books: - id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True) - bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD)) - fs.append(bf) - - return Search.chain_filters(fs) - - def should_search_for_book(self): - return self._books == [] - - def just_search_in(self, all): - """Holds logic to figure out which indexes should be search, when we have some hinst already""" - some = [] - for field in all: - if field == 'authors' and 'author' in self.book_tags: - continue - if field == 'title' and self._books != []: - continue - if (field == 'themes' or field == 'themes_pl') and self.part_tags: - continue - some.append(field) - return some - -class Search(IndexStore): +class Search(SolrIndex): """ Search facilities. """ - def __init__(self, default_field="content"): - IndexStore.__init__(self) - self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34) - # self.analyzer = WLAnalyzer() - reader = IndexReader.open(self.store, True) - self.searcher = IndexSearcher(reader) - self.parser = QueryParser(Version.LUCENE_34, default_field, - self.analyzer) - - self.parent_filter = TermsFilter() - self.parent_filter.addTerm(Term("is_book", "true")) - index_changed.connect(self.reopen) - - def close(self): - reader = self.searcher.getIndexReader() - self.searcher.close() - reader.close() - super(Search, self).close() - index_changed.disconnect(self.reopen) - - def reopen(self, **unused): - reader = self.searcher.getIndexReader() - rdr = reader.reopen() - if not rdr.equals(reader): - log.debug('Reopening index') - oldsearch = self.searcher - self.searcher = IndexSearcher(rdr) - oldsearch.close() - reader.close() - - def query(self, query): - """Parse query in default Lucene Syntax. (for humans) - """ - return self.parser.parse(query) - - def simple_search(self, query, max_results=50): - """Runs a query for books using lucene syntax. (for humans) - Returns (books, total_hits) - """ - - tops = self.searcher.search(self.query(query), max_results) - bks = [] - for found in tops.scoreDocs: - doc = self.searcher.doc(found.doc) - bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) - return (bks, tops.totalHits) - - def get_tokens(self, searched, field='content', cached=None): - """returns tokens analyzed by a proper (for a field) analyzer - argument can be: StringReader, string/unicode, or tokens. In the last case - they will just be returned (so we can reuse tokens, if we don't change the analyzer) - """ - if cached is not None and field in cached: - return cached[field] - - if isinstance(searched, str) or isinstance(searched, unicode): - searched = StringReader(searched) - elif isinstance(searched, list): - return searched - - searched.reset() - tokens = self.analyzer.reusableTokenStream(field, searched) - toks = [] - while tokens.incrementToken(): - cta = tokens.getAttribute(CharTermAttribute.class_) - toks.append(cta.toString()) - - if cached is not None: - cached[field] = toks - - return toks - - @staticmethod - def fuzziness(fuzzy): - """Helper method to sanitize fuzziness""" - if not fuzzy: - return None - if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0: - return fuzzy - else: - return 0.5 - - def make_phrase(self, tokens, field='content', slop=2, fuzzy=False): - """ - Return a PhraseQuery with a series of tokens. 
- """ - if fuzzy: - phrase = MultiPhraseQuery() - for t in tokens: - term = Term(field, t) - fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy)) - fuzzterms = [] - - while True: - ft = fuzzterm.term() - if ft: - fuzzterms.append(ft) - if not fuzzterm.next(): break - if fuzzterms: - phrase.add(JArray('object')(fuzzterms, Term)) - else: - phrase.add(term) - else: - phrase = PhraseQuery() - phrase.setSlop(slop) - for t in tokens: - term = Term(field, t) - phrase.add(term) - return phrase + def __init__(self, default_field="text"): + super(Search, self).__init__(mode='r') - @staticmethod - def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): + # def get_tokens(self, searched, field='text', cached=None): + # """returns tokens analyzed by a proper (for a field) analyzer + # argument can be: StringReader, string/unicode, or tokens. In the last case + # they will just be returned (so we can reuse tokens, if we don't change the analyzer) + # """ + # if cached is not None and field in cached: + # return cached[field] + + # if isinstance(searched, str) or isinstance(searched, unicode): + # searched = StringReader(searched) + # elif isinstance(searched, list): + # return searched + + # searched.reset() + # tokens = self.analyzer.reusableTokenStream(field, searched) + # toks = [] + # while tokens.incrementToken(): + # cta = tokens.getAttribute(CharTermAttribute.class_) + # toks.append(cta.toString()) + + # if cached is not None: + # cached[field] = toks + + # return toks + + # @staticmethod + # def fuzziness(fuzzy): + # """Helper method to sanitize fuzziness""" + # if not fuzzy: + # return None + # if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0: + # return fuzzy + # else: + # return 0.5 + + # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False): + # """ + # Return a PhraseQuery with a series of tokens. + # """ + # if fuzzy: + # phrase = MultiPhraseQuery() + # for t in tokens: + # term = Term(field, t) + # fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy)) + # fuzzterms = [] + + # while True: + # ft = fuzzterm.term() + # if ft: + # fuzzterms.append(ft) + # if not fuzzterm.next(): break + # if fuzzterms: + # phrase.add(JArray('object')(fuzzterms, Term)) + # else: + # phrase.add(term) + # else: + # phrase = PhraseQuery() + # phrase.setSlop(slop) + # for t in tokens: + # term = Term(field, t) + # phrase.add(term) + # return phrase + + def make_term_query(self, query, field='text', modal=operator.or_): """ Returns term queries joined by boolean query. modal - applies to boolean query fuzzy - should the query by fuzzy. 
""" - q = BooleanQuery() - for t in tokens: - term = Term(field, t) - if fuzzy: - term = FuzzyQuery(term, self.fuzziness(fuzzy)) - else: - term = TermQuery(term) - q.add(BooleanClause(term, modal)) + q = self.index.Q() + q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), + query.split(r" ")), q) + return q - def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False, - filters=None, tokens_cache=None, boost=None, snippets=False, slop=2): + def search_phrase(self, searched, field='text', book=False, + filters=None, + snippets=False): if filters is None: filters = [] - if tokens_cache is None: tokens_cache = {} - - tokens = self.get_tokens(searched, field, cached=tokens_cache) - - query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop) - if book: - filters.append(self.term_filter(Term('is_book', 'true'))) - top = self.searcher.search(query, self.chain_filters(filters), max_results) + if book: filters.append(self.index.Q(is_book=True)) - return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs] + q = self.index.query(**{field: searched}) + q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True) + res = q.execute() + return [SearchResult(found, how_found=u'search_phrase') for found in res] - def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False, - filters=None, tokens_cache=None, boost=None, snippets=True): + def search_some(self, searched, fields, book=True, + filters=None, snippets=True, query_terms=None): + assert isinstance(fields, list) if filters is None: filters = [] - if tokens_cache is None: tokens_cache = {} + if book: filters.append(self.index.Q(is_book=True)) - if book: - filters.append(self.term_filter(Term('is_book', 'true'))) - - query = BooleanQuery() + query = self.index.Q() for fld in fields: - tokens = self.get_tokens(searched, fld, cached=tokens_cache) - - query.add(BooleanClause(self.make_term_query(tokens, field=fld, - fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) - - top = self.searcher.search(query, self.chain_filters(filters), max_results) - - return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache, - snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs] + query = self.index.Q(query | self.make_term_query(searched, fld)) - def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None): - """ - Search for perfect book matches. Just see if the query matches with some author or title, - taking hints into account. 
- """ - fields_to_search = ['authors', 'title'] - only_in = None - if hint: - if not hint.should_search_for_book(): - return [] - fields_to_search = hint.just_search_in(fields_to_search) - only_in = hint.book_filter() - - qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search] - - books = [] - for q in qrys: - top = self.searcher.search(q, - self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), - max_results) - for found in top.scoreDocs: - books.append(SearchResult(self, found, how_found="search_perfect_book")) - return books - - def search_book(self, searched, max_results=20, fuzzy=False, hint=None): - fields_to_search = ['tags', 'authors', 'title'] - - only_in = None - if hint: - if not hint.should_search_for_book(): - return [] - fields_to_search = hint.just_search_in(fields_to_search) - only_in = hint.book_filter() - - tokens = self.get_tokens(searched, field='SIMPLE') + query = self.index.query(query) + query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True) + res = query.execute() + return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res] - q = BooleanQuery() - - for fld in fields_to_search: - q.add(BooleanClause(self.make_term_query(tokens, field=fld, - fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) - - books = [] - top = self.searcher.search(q, - self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), - max_results) - for found in top.scoreDocs: - books.append(SearchResult(self, found, how_found="search_book")) - - return books - - def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None): - """ - Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase()) - some part/fragment of the book. - """ - qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']] + # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None): + # """ + # Search for perfect book matches. Just see if the query matches with some author or title, + # taking hints into account. 
+ # """ + # fields_to_search = ['authors', 'title'] + # only_in = None + # if hint: + # if not hint.should_search_for_book(): + # return [] + # fields_to_search = hint.just_search_in(fields_to_search) + # only_in = hint.book_filter() + + # qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search] + + # books = [] + # for q in qrys: + # top = self.searcher.search(q, + # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), + # max_results) + # for found in top.scoreDocs: + # books.append(SearchResult(self, found, how_found="search_perfect_book")) + # return books + + # def search_book(self, searched, max_results=20, fuzzy=False, hint=None): + # fields_to_search = ['tags', 'authors', 'title'] + + # only_in = None + # if hint: + # if not hint.should_search_for_book(): + # return [] + # fields_to_search = hint.just_search_in(fields_to_search) + # only_in = hint.book_filter() + + # tokens = self.get_tokens(searched, field='SIMPLE') + + # q = BooleanQuery() + + # for fld in fields_to_search: + # q.add(BooleanClause(self.make_term_query(tokens, field=fld, + # fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) + + # books = [] + # top = self.searcher.search(q, + # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]), + # max_results) + # for found in top.scoreDocs: + # books.append(SearchResult(self, found, how_found="search_book")) + + # return books + + # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None): + # """ + # Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase()) + # some part/fragment of the book. + # """ + # qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']] - flt = None - if hint: - flt = hint.part_filter() + # flt = None + # if hint: + # flt = hint.part_filter() - books = [] - for q in qrys: - top = self.searcher.search(q, - self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True), - flt]), - max_results) - for found in top.scoreDocs: - books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts')) + # books = [] + # for q in qrys: + # top = self.searcher.search(q, + # self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True), + # flt]), + # max_results) + # for found in top.scoreDocs: + # books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts')) - return books + # return books - def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None): + def search_everywhere(self, searched, query_terms=None): """ Tries to use search terms to match different fields of book (or its parts). E.g. one word can be an author survey, another be a part of the title, and the rest are some words from third chapter. 
""" - if tokens_cache is None: tokens_cache = {} books = [] - only_in = None - - if hint: - only_in = hint.part_filter() - # content only query : themes x content - q = BooleanQuery() - - tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache) - tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache) + q = self.make_term_query(searched, 'text') + q_themes = self.make_term_query(searched, 'themes_pl') - # only search in themes when we do not already filter by themes - if hint is None or hint.just_search_in(['themes']) != []: - q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl', - fuzzy=fuzzy), BooleanClause.Occur.MUST)) + query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True) + res = query.execute() - q.add(BooleanClause(self.make_term_query(tokens_pl, field='content', - fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) - - topDocs = self.searcher.search(q, only_in, max_results) - for found in topDocs.scoreDocs: - books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched)) + for found in res: + books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms)) # query themes/content x author/title/tags - q = BooleanQuery() - in_content = BooleanQuery() - in_meta = BooleanQuery() + in_content = self.index.Q() + in_meta = self.index.Q() - for fld in ['themes_pl', 'content']: - in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD)) + for fld in ['themes_pl', 'text']: + in_content |= self.make_term_query(searched, field=fld) for fld in ['tags', 'authors', 'title']: - in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD)) + in_meta |= self.make_term_query(searched, field=fld) - q.add(BooleanClause(in_content, BooleanClause.Occur.MUST)) - q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD)) + q = in_content & in_meta + res = self.index.query(q).field_limit(score=True, all_fields=True).execute() - topDocs = self.searcher.search(q, only_in, max_results) - for found in topDocs.scoreDocs: - books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched)) + for found in res: + books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms)) return books - # def multisearch(self, query, max_results=50): - # """ - # Search strategy: - # - (phrase) OR -> content - # -> title - # -> authors - # - (keywords) -> authors - # -> motyw - # -> tags - # -> content - # """ - # queryreader = StringReader(query) - # tokens = self.get_tokens(queryreader) - - # top_level = BooleanQuery() - # Should = BooleanClause.Occur.SHOULD - - # phrase_level = BooleanQuery() - # phrase_level.setBoost(1.3) - - # p_content = self.make_phrase(tokens, joined=True) - # p_title = self.make_phrase(tokens, 'title') - # p_author = self.make_phrase(tokens, 'author') - - # phrase_level.add(BooleanClause(p_content, Should)) - # phrase_level.add(BooleanClause(p_title, Should)) - # phrase_level.add(BooleanClause(p_author, Should)) - - # kw_level = BooleanQuery() - - # kw_level.add(self.make_term_query(tokens, 'author'), Should) - # j_themes = self.make_term_query(tokens, 'themes', joined=True) - # kw_level.add(j_themes, Should) - # kw_level.add(self.make_term_query(tokens, 'tags'), Should) - # j_con = self.make_term_query(tokens, joined=True) - # kw_level.add(j_con, Should) - - # 
top_level.add(BooleanClause(phrase_level, Should)) - # top_level.add(BooleanClause(kw_level, Should)) - - # return None - - def get_snippets(self, scoreDoc, query, field='content'): + def get_snippets(self, searchresult, query, field='text', num=1): """ Returns a snippet for found scoreDoc. """ - htmlFormatter = SimpleHTMLFormatter() - highlighter = Highlighter(htmlFormatter, QueryScorer(query)) - - stored = self.searcher.doc(scoreDoc.doc) - - position = stored.get('snippets_position') - length = stored.get('snippets_length') - if position is None or length is None: - return None - revision = stored.get('snippets_revision') - if revision: revision = int(revision) - - # locate content. - book_id = int(stored.get('book_id')) + maxnum = len(searchresult) + if num is None or num < 0 or num > maxnum: + num = maxnum + book_id = searchresult.book_id + revision = searchresult.snippet_revision() snippets = Snippets(book_id, revision=revision) - + snips = [None] * maxnum try: snippets.open() + idx = 0 + while idx < maxnum and num > 0: + position, length = searchresult.snippet_pos(idx) + if position is None or length is None: + continue + text = snippets.get((int(position), + int(length))) + snip = self.index.highlight(text=text, field=field, q=query) + snips[idx] = snip + if snip: + num -= 1 + idx += 1 + except IOError, e: log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e)) return [] + finally: + snippets.close() - try: - try: - text = snippets.get((int(position), - int(length))) - finally: - snippets.close() + # remove verse end markers.. + snips = map(lambda s: s and s.replace("/\n", "\n"), snips) - tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) - # highlighter.getBestTextFragments(tokenStream, text, False, 10) - snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + searchresult.snippets = snips - except Exception, e: - e2 = e - if hasattr(e, 'getJavaException'): - e2 = unicode(e.getJavaException()) - raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)), - e2) - return snip + return snips - @staticmethod - def enum_to_array(enum): + def hint_tags(self, query, pdcounter=True, prefix=True): """ - Converts a lucene TermEnum to array of Terms, suitable for - addition to queries + Return auto-complete hints for tags + using prefix search. """ - terms = [] - - while True: - t = enum.term() - if t: - terms.append(t) - if not enum.next(): break + q = self.index.Q() + query = query.strip() + for field in ['tag_name', 'tag_name_pl']: + if prefix: + q |= self.index.Q(**{field: query + "*"}) + else: + q |= self.make_term_query(query, field=field) + qu = self.index.query(q).exclude(tag_category="book") - if terms: - return JArray('object')(terms, Term) + return self.search_tags(qu, pdcounter=pdcounter) - def search_tags(self, query, filt=None, max_results=40, pdcounter=False): + def search_tags(self, query, filters=None, pdcounter=False): """ Search for Tag objects using query. 
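The hint_tags helper above and hint_books further down are what the autocomplete view in this patch ends up calling. A short usage sketch, assuming Search() is constructed the same way the updated views code does (the prefix strings are just examples):

    search = Search()
    tag_hints = search.hint_tags(u"mick", pdcounter=True, prefix=True)   # prefix match on tag_name / tag_name_pl, book tags excluded
    book_hints = search.hint_books(u"pan t")                             # prefix match on title, restricted to is_book=True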
""" + if not filters: filters = [] if not pdcounter: - filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)]) - tops = self.searcher.search(query, filt, max_results) + filters.append(~self.index.Q(is_pdcounter=True)) + res = self.apply_filters(query, filters).execute() tags = [] - for found in tops.scoreDocs: - doc = self.searcher.doc(found.doc) - is_pdcounter = doc.get('is_pdcounter') + for doc in res: + is_pdcounter = doc.get('is_pdcounter', False) category = doc.get('tag_category') try: - if is_pdcounter == 'true': + if is_pdcounter == True: if category == 'pd_author': tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) elif category == 'pd_book': @@ -1420,98 +1023,81 @@ class Search(IndexStore): return tags - def search_books(self, query, filt=None, max_results=10): + def hint_books(self, query, prefix=True): + """ + Returns auto-complete hints for book titles + Because we do not index 'pseudo' title-tags. + Prefix search. + """ + q = self.index.Q() + query = query.strip() + if prefix: + q |= self.index.Q(title=query + "*") + else: + q |= self.make_term_query(query, field='title') + qu = self.index.query(q) + only_books = self.index.Q(is_book=True) + return self.search_books(qu, [only_books]) + + def search_books(self, query, filters=None, max_results=10): """ Searches for Book objects using query """ bks = [] - tops = self.searcher.search(query, filt, max_results) - for found in tops.scoreDocs: - doc = self.searcher.doc(found.doc) + res = self.apply_filters(query, filters).field_limit(['book_id']) + for r in res: try: - bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + bks.append(catalogue.models.Book.objects.get(id=r['book_id'])) except catalogue.models.Book.DoesNotExist: pass return bks + + # def make_prefix_phrase(self, toks, field): + # q = MultiPhraseQuery() + # for i in range(len(toks)): + # t = Term(field, toks[i]) + # if i == len(toks) - 1: + # pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t)) + # if pterms: + # q.add(pterms) + # else: + # q.add(t) + # else: + # q.add(t) + # return q + + # @staticmethod + # def term_filter(term, inverse=False): + # only_term = TermsFilter() + # only_term.addTerm(term) + + # if inverse: + # neg = BooleanFilter() + # neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT)) + # only_term = neg + + # return only_term - def make_prefix_phrase(self, toks, field): - q = MultiPhraseQuery() - for i in range(len(toks)): - t = Term(field, toks[i]) - if i == len(toks) - 1: - pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t)) - if pterms: - q.add(pterms) - else: - q.add(t) - else: - q.add(t) - return q - - @staticmethod - def term_filter(term, inverse=False): - only_term = TermsFilter() - only_term.addTerm(term) - - if inverse: - neg = BooleanFilter() - neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT)) - only_term = neg - - return only_term - - def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False): - """ - Return auto-complete hints for tags - using prefix search. 
- """ - toks = self.get_tokens(string, field='SIMPLE') - top = BooleanQuery() - - for field in ['tag_name', 'tag_name_pl']: - if prefix: - q = self.make_prefix_phrase(toks, field) - else: - q = self.make_term_query(toks, field, fuzzy=fuzzy) - top.add(BooleanClause(q, BooleanClause.Occur.SHOULD)) - - no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True) - return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter) - - def hint_books(self, string, max_results=50, prefix=True, fuzzy=False): - """ - Returns auto-complete hints for book titles - Because we do not index 'pseudo' title-tags. - Prefix search. - """ - toks = self.get_tokens(string, field='SIMPLE') - - if prefix: - q = self.make_prefix_phrase(toks, 'title') - else: - q = self.make_term_query(toks, 'title', fuzzy=fuzzy) - - return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results) @staticmethod - def chain_filters(filters, op=ChainedFilter.AND): + def apply_filters(query, filters): """ - Chains a filter list together + Apply filters to a query """ + if filters is None: filters = [] filters = filter(lambda x: x is not None, filters) - if not filters or filters is []: - return None - chf = ChainedFilter(JArray('object')(filters, Filter), op) - return chf + for f in filters: + query = query.query(f) + return query - def filtered_categories(self, tags): - """ - Return a list of tag categories, present in tags list. - """ - cats = {} - for t in tags: - cats[t.category] = True - return cats.keys() + # def filtered_categories(self, tags): + # """ + # Return a list of tag categories, present in tags list. + # """ + # cats = {} + # for t in tags: + # cats[t.category] = True + # return cats.keys() - def hint(self): - return Hint(self) + # def hint(self): + # return Hint(self) diff --git a/apps/search/templates/newsearch/search.html b/apps/search/templates/newsearch/search.html index c494ca602..635bae86a 100644 --- a/apps/search/templates/newsearch/search.html +++ b/apps/search/templates/newsearch/search.html @@ -29,11 +29,9 @@ <li> <p><a href="{{result.book.get_absolute_url}}">{{result.book.pretty_title}}</a> (id: {{result.book_id}}, score: {{result.score}})</p> <ul> - {% for hit in result.hits %} + {% for snip in hit.snippets %} <li> - {% for snip in hit.3.snippets %} - {{snip|safe}}<br/> - {% endfor %} + {{snip|safe}}<br/> </li> {% endfor %} diff --git a/apps/search/templatetags/search_tags.py b/apps/search/templatetags/search_tags.py index 97deb9d13..a167f024d 100644 --- a/apps/search/templatetags/search_tags.py +++ b/apps/search/templatetags/search_tags.py @@ -6,15 +6,11 @@ # import datetime from django import template -from django.template import Node, Variable -from django.utils.encoding import smart_str -from django.core.urlresolvers import reverse # from django.contrib.auth.forms import UserCreationForm, AuthenticationForm # from django.db.models import Q -from django.conf import settings # from django.utils.translation import ugettext as _ -from catalogue.templatetags.catalogue_tags import book_wide from catalogue.models import Book +import re # from catalogue.forms import SearchForm # from catalogue.utils import split_tags @@ -35,18 +31,28 @@ def book_searched(context, result): # We don't need hits which lead to sections but do not have # snippets. 
- hits = filter(lambda h: 'fragment' in h or - h['snippets'], result.hits)[0:5] - - for hit in hits: - hit['snippets'] = map(lambda s: s.replace("\n", "<br />").replace('---', '—'), hit['snippets']) + hits = filter(lambda (idx, h): + result.snippets[idx] is not None + or 'fragment' in h, enumerate(result.hits)) + # print "[tmpl: from %d hits selected %d]" % (len(result.hits), len(hits)) + + for (idx, hit) in hits: + # currently we generate one snipper per hit though. + if result.snippets[idx] is None: + continue + snip = result.snippets[idx] + # fix some formattting + snip = re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"", + re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", snip)[0])[0] + + snip = snip.replace("\n", "<br />").replace('---', '—') + hit['snippet'] = snip return { 'related': book.related_info(), 'book': book, 'main_link': book.get_absolute_url(), 'request': context.get('request'), - 'hits': hits, + 'hits': hits and zip(*hits)[1] or [], 'main_link': book.get_absolute_url(), } - diff --git a/apps/search/views.py b/apps/search/views.py index ec8275b91..72852d0d4 100644 --- a/apps/search/views.py +++ b/apps/search/views.py @@ -12,7 +12,7 @@ from catalogue.utils import split_tags from catalogue.models import Book, Tag, Fragment from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook from catalogue.views import JSONResponse -from search import Search, JVM, SearchResult +from search import Search, SearchResult from lucene import StringReader from suggest.forms import PublishingSuggestForm from time import sleep @@ -20,8 +20,6 @@ import re #import enchant import json -#dictionary = enchant.Dict('en_US') - def match_word_re(word): if 'sqlite' in settings.DATABASES['default']['ENGINE']: @@ -31,60 +29,37 @@ def match_word_re(word): def did_you_mean(query, tokens): - change = {} - for t in tokens: - authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t)) - if len(authors) > 0: - continue - - if False: - if not dictionary.check(t): - try: - change_to = dictionary.suggest(t)[0].lower() - if change_to != t.lower(): - change[t] = change_to - except IndexError: - pass - - if change == {}: - return None - - for frm, to in change.items(): - query = query.replace(frm, to) - return query + # change = {} + # for t in tokens: + # authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t)) + # if len(authors) > 0: + # continue + # if False: + # if not dictionary.check(t): + # try: + # change_to = dictionary.suggest(t)[0].lower() + # if change_to != t.lower(): + # change[t] = change_to + # except IndexError: + # pass -JVM.attachCurrentThread() -_search = None - + # if change == {}: + # return None -def get_search(): - global _search + # for frm, to in change.items(): + # query = query.replace(frm, to) - while _search is False: - sleep(1) - - if _search is None: - _search = False - _search = Search() - return _search + # return query def hint(request): prefix = request.GET.get('term', '') if len(prefix) < 2: return JSONResponse([]) - JVM.attachCurrentThread() - - search = get_search() - hint = search.hint() - try: - tags = request.GET.get('tags', '') - hint.tags(Tag.get_tag_list(tags)) - except: - pass + search = Search() # tagi beda ograniczac tutaj # ale tagi moga byc na ksiazce i na fragmentach # jezeli tagi dot tylko ksiazki, to wazne zeby te nowe byly w tej samej ksiazce @@ -93,7 +68,6 @@ def hint(request): tags = search.hint_tags(prefix, pdcounter=True) books = search.hint_books(prefix) - def is_dupe(tag): if isinstance(tag, PDCounterAuthor): if 
filter(lambda t: t.slug == tag.slug and t != tag, tags): @@ -126,64 +100,51 @@ def hint(request): content_type="application/json; charset=utf-8") else: return JSONResponse(data) - def main(request): results = {} - JVM.attachCurrentThread() # where to put this? results = None query = None - fuzzy = False #0.8 - - query = request.GET.get('q','') - # book_id = request.GET.get('book', None) - # book = None - # if book_id is not None: - # book = get_object_or_404(Book, id=book_id) - # hint = search.hint() - # try: - # tag_list = Tag.get_tag_list(tags) - # except: - # tag_list = [] + query = request.GET.get('q', '') if len(query) < 2: - return render_to_response('catalogue/search_too_short.html', {'prefix': query}, - context_instance=RequestContext(request)) - - search = get_search() - # hint.tags(tag_list) - # if book: - # hint.books(book) - tags = search.hint_tags(query, pdcounter=True, prefix=False, fuzzy=fuzzy) - tags = split_tags(tags) + return render_to_response('catalogue/search_too_short.html', + {'prefix': query}, + context_instance=RequestContext(request)) + search = Search() - toks = StringReader(query) - tokens_cache = {} + theme_terms = search.index.analyze(text=query, field="themes_pl") \ + + search.index.analyze(text=query, field="themes") - author_results = search.search_phrase(toks, 'authors', fuzzy=fuzzy, tokens_cache=tokens_cache) - title_results = search.search_phrase(toks, 'title', fuzzy=fuzzy, tokens_cache=tokens_cache) + # change hints + tags = search.hint_tags(query, pdcounter=True, prefix=False) + tags = split_tags(tags) + + author_results = search.search_phrase(query, 'authors', book=True) + title_results = search.search_phrase(query, 'title', book=True) # Boost main author/title results with mixed search, and save some of its results for end of list. # boost author, title results - author_title_mixed = search.search_some(toks, ['authors', 'title', 'tags'], fuzzy=fuzzy, tokens_cache=tokens_cache) + author_title_mixed = search.search_some(query, ['authors', 'title', 'tags'], query_terms=theme_terms) author_title_rest = [] + for b in author_title_mixed: - bks = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results) - for b2 in bks: + also_in_mixed = filter(lambda ba: ba.book_id == b.book_id, author_results + title_results) + for b2 in also_in_mixed: b2.boost *= 1.1 - if bks is []: + if also_in_mixed is []: author_title_rest.append(b) # Do a phrase search but a term search as well - this can give us better snippets then search_everywhere, # Because the query is using only one field. 
text_phrase = SearchResult.aggregate( - search.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False, slop=4), - search.search_some(toks, ['content'], tokens_cache=tokens_cache, snippets=True, book=False)) + search.search_phrase(query, 'text', snippets=True, book=False), + search.search_some(query, ['text'], snippets=True, book=False, query_terms=theme_terms)) - everywhere = search.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache) + everywhere = search.search_everywhere(query, query_terms=theme_terms) def already_found(results): def f(e): @@ -202,15 +163,15 @@ def main(request): everywhere = SearchResult.aggregate(everywhere, author_title_rest) - for res in [author_results, title_results, text_phrase, everywhere]: + for field, res in [('authors', author_results), + ('title', title_results), + ('text', text_phrase), + ('text', everywhere)]: res.sort(reverse=True) for r in res: - for h in r.hits: - h['snippets'] = map(lambda s: - re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"", - re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets']) + search.get_snippets(r, query, field, 3) - suggestion = did_you_mean(query, search.get_tokens(toks, field="SIMPLE")) + suggestion = u'' def ensure_exists(r): try: @@ -246,9 +207,9 @@ def main(request): return render_to_response('catalogue/search_multiple_hits.html', {'tags': tags, 'prefix': query, - 'results': { 'author': author_results, - 'title': title_results, - 'content': text_phrase, - 'other': everywhere}, + 'results': {'author': author_results, + 'title': title_results, + 'content': text_phrase, + 'other': everywhere}, 'did_you_mean': suggestion}, context_instance=RequestContext(request)) diff --git a/doc/schema.xml b/doc/schema.xml new file mode 100644 index 000000000..510a9610b --- /dev/null +++ b/doc/schema.xml @@ -0,0 +1,1150 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<!-- + This is the Solr schema file. This file should be named "schema.xml" and + should be in the conf directory under the solr home + (i.e. ./solr/conf/schema.xml by default) + or located where the classloader for the Solr webapp can find it. + + This example schema is the recommended starting point for users. + It should be kept correct and concise, usable out-of-the-box. + + For more information, on how to customize this file, please see + http://wiki.apache.org/solr/SchemaXml + + PERFORMANCE NOTE: this schema includes many optional features and should not + be used for benchmarking. To improve performance one could + - set stored="false" for all fields possible (esp large fields) when you + only need to search on the field but don't need to return the original + value. 
+ - set indexed="false" if you don't need to search on the field, but only + return the field as a result of searching on other indexed fields. + - remove all unneeded copyField statements + - for best index size and searching performance, set "index" to false + for all general text fields, use copyField to copy them to the + catchall "text" field, and use that for searching. + - For maximum indexing performance, use the StreamingUpdateSolrServer + java client. + - Remember to run the JVM in server mode, and use a higher logging level + that avoids logging every request +--> + +<schema name="example" version="1.5"> + <!-- attribute "name" is the name of this schema and is only used for display purposes. + version="x.y" is Solr's version number for the schema syntax and semantics. It should + not normally be changed by applications. + 1.0: multiValued attribute did not exist, all fields are multiValued by nature + 1.1: multiValued attribute introduced, false by default + 1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields. + 1.3: removed optional field compress feature + 1.4: default auto-phrase (QueryParser feature) to off + 1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...) + --> + + <fields> + <!-- Valid attributes for fields: + name: mandatory - the name for the field + type: mandatory - the name of a field type from the + <types> fieldType section + indexed: true if this field should be indexed (searchable or sortable) + stored: true if this field should be retrievable + multiValued: true if this field may contain multiple values per document + omitNorms: (expert) set to true to omit the norms associated with + this field (this disables length normalization and index-time + boosting for the field, and saves some memory). Only full-text + fields or fields that need an index-time boost need norms. + Norms are omitted for primitive (non-analyzed) types by default. + termVectors: [false] set to true to store the term vector for a + given field. + When using MoreLikeThis, fields used for similarity should be + stored for best performance. + termPositions: Store position information with the term vector. + This will increase storage costs. + termOffsets: Store offset information with the term vector. This + will increase storage costs. + required: The field is required. It will throw an error if the + value does not exist + default: a value that should be used if no value is specified + when adding a document. + --> + + <!-- field names should consist of alphanumeric or underscore characters only and + not start with a digit. This is not currently strictly enforced, + but other field names will not have first class support from all components + and back compatibility is not guaranteed. Names with both leading and + trailing underscores (e.g. _version_) are reserved. 
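In this schema the stored/indexed distinction matters in practice: the main text field is indexed but not stored (the raw fragment text lives in the Snippets file store and is addressed via snippets_position/snippets_length), while display-oriented fields such as themes and tag_name are stored. A rough sketch of a query that pulls back only stored fields, using the same field_limit chaining seen in the Python changes above and assuming sunburnt's default dict-style results (field names come from this schema, the query string is an example):

    res = search.index.query(text=u"wolne") \
                      .field_limit(['book_id', 'header_index', 'snippets_position', 'snippets_length'],
                                   score=True) \
                      .execute()
    for doc in res:
        print doc['book_id'], doc.get('score')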
+ --> + + <!-- <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> --> + <!-- <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/> --> + <!-- <field name="name" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/> --> + <!-- <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/> --> + <!-- <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/> --> + <!-- <field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> --> + + <!-- <field name="weight" type="float" indexed="true" stored="true"/> --> + <!-- <field name="price" type="float" indexed="true" stored="true"/> --> + <!-- <field name="popularity" type="int" indexed="true" stored="true" /> --> + <!-- <field name="inStock" type="boolean" indexed="true" stored="true" /> --> + + <!-- <field name="store" type="location" indexed="true" stored="true"/> --> + + <!-- Common metadata fields, named specifically to match up with + SolrCell metadata when parsing rich documents such as Word, PDF. + Some fields are multiValued only because Tika currently may return + multiple values for them. Some metadata is parsed from the documents, + but there are some which come from the client context: + "content_type": From the HTTP headers of incoming stream + "resourcename": From SolrCell request param resource.name + --> + <!-- <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/> --> + <!-- <field name="subject" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="description" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="comments" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="author" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="keywords" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="category" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="resourcename" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="url" type="text_general" indexed="true" stored="true"/> --> + <!-- <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> --> + <!-- <field name="last_modified" type="date" indexed="true" stored="true"/> --> + <!-- <field name="links" type="string" indexed="true" stored="true" multiValued="true"/> --> + + <field name="book_id" type="int" indexed="true" stored="true" /> + <field name="parent_id" type="int" indexed="false" stored="true" /> + <field name="slug" type="text_general" stored="false" indexed="true" omitNorms="true"/> <!-- no norms --> + <field name="tags" type="lowercase" stored="false" indexed="true" multiValued="true"/> + <field name="is_book" type="boolean" stored="false" indexed="true"/> + <field name="authors" type="text_general" stored="false" indexed="true" multiValued="true"/> + <field name="translators" type="text_general" stored="false" indexed="true" multiValued="true"/> + <field name="title" type="text_pl" stored="false" indexed="true"/> + <field name="title_orig" type="text_general" stored="false" indexed="true"/> +<!-- <field name="published_date" type="tdate" stored="false" indexed="true"/>--> + <field name="published_date" type="string" stored="true" 
indexed="true"/> + + <field name="themes" type="lowercase" stored="true" intexed="true" termVectors="true" termPositions="true" multiValued="true" /> + <field name="themes_pl" type="text_pl" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" /> + <field name="header_index" type="int" stored="true" indexed="true"/> + <field name="header_span" type="int" stored="true" indexed="true"/> + <field name="header_type" type="lowercase" stored="true" indexed="false"/> + <field name="text" type="text_pl" stored="false" indexed="true" termPositions="true" /> + + <field name="snippets_position" type="int" stored="true" indexed="false"/> + <field name="snippets_length" type="int" stored="true" indexed="false"/> + <field name="snippets_revision" type="int" stored="true" indexed="false"/> + <field name="fragment_anchor" type="string" stored="true" indexed="false"/> + + <field name="tag_id" type="int" stored="true" indexed="true"/> + <field name="tag_name" type="lowercase" stored="true" intexed="true" /> + <field name="tag_name_pl" type="text_pl" stored="false" indexed="true" multiValued="true"/> + <field name="tag_category" type="string" stored="true" indexed="true" /> + <field name="is_pdcounter" type="boolean" stored="true" indexed="true" /> + + <!-- Main body of document extracted by SolrCell. + NOTE: This field is not indexed by default, since it is also copied to "text" + using copyField below. This is to save space. Use this field for returning and + highlighting document content. Use the "text" field to search the content. --> + <!-- <field name="content" type="text_general" indexed="false" stored="true" multiValued="true"/> --> + + + <!-- catchall field, containing all other searchable text fields (implemented + via copyField further on in this schema --> + <!-- <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/> --> + + <!-- catchall text field that indexes tokens both normally and in reverse for efficient + leading wildcard queries. --> + <!-- <field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/> --> + + <!-- non-tokenized version of manufacturer to make it easier to sort or group + results by manufacturer. copied from "manu" via copyField --> + <!-- <field name="manu_exact" type="string" indexed="true" stored="false"/> --> + + <!-- <field name="payloads" type="payloads" indexed="true" stored="true"/> --> + + <!-- <field name="_version_" type="long" indexed="true" stored="true"/> --> + + <!-- Uncommenting the following will create a "timestamp" field using + a default value of "NOW" to indicate when each document was indexed. + --> + <!-- + <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> + --> + + <!-- Dynamic field definitions allow using convention over configuration + for fields via the specification of patterns to match field names. + EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i) + RESTRICTION: the glob-like pattern in the name attribute must have + a "*" only at the start or the end. 
--> + + <dynamicField name="*_i" type="int" indexed="true" stored="true"/> + <dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_s" type="string" indexed="true" stored="true" /> + <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_l" type="long" indexed="true" stored="true"/> + <dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/> + <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/> + <dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_f" type="float" indexed="true" stored="true"/> + <dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_d" type="double" indexed="true" stored="true"/> + <dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true"/> + + <!-- Type used to index the lat and lon components for the "location" FieldType --> + <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" /> + + <dynamicField name="*_dt" type="date" indexed="true" stored="true"/> + <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_p" type="location" indexed="true" stored="true"/> + + <!-- some trie-coded dynamic fields for faster range queries --> + <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/> + <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/> + <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/> + <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/> + <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/> + + <dynamicField name="*_pi" type="pint" indexed="true" stored="true"/> +<!-- <dynamicField name="*_c" type="currency" indexed="true" stored="true"/>--> + + <dynamicField name="ignored_*" type="ignored" multiValued="true"/> + <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/> + + <dynamicField name="random_*" type="random" /> + + <!-- uncomment the following to ignore any fields that don't already match an existing + field name or dynamic field, rather than reporting them as an error. + alternately, change the type="ignored" to some other type e.g. "text" if you want + unknown fields indexed and/or stored by default --> + <!--dynamicField name="*" type="ignored" multiValued="true" /--> + <field name="uid" type="string" indexed="true" stored="true"/> + </fields> + + + <!-- Field to use to determine and enforce document uniqueness. + Unless this field is marked with required="false", it will be a required field + --> + <uniqueKey>uid</uniqueKey> + + <!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when + parsing a query string that isn't explicit about the field. Machine (non-user) + generated queries are best made explicit, or they can use the "df" request parameter + which takes precedence over this. + Note: Un-commenting defaultSearchField will be insufficient if your request handler + in solrconfig.xml defines "df", which takes precedence. 
That would need to be removed. + <defaultSearchField>text</defaultSearchField> --> + + <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers + when parsing a query string to determine if a clause of the query should be marked as + required or optional, assuming the clause isn't already marked by some operator. + The default is OR, which is generally assumed so it is not a good idea to change it + globally here. The "q.op" request parameter takes precedence over this. + <solrQueryParser defaultOperator="OR"/> --> + + <!-- copyField commands copy one field to another at the time a document + is added to the index. It's used either to index the same field differently, + or to add multiple fields to the same field for easier/faster searching. --> + + <copyField source="themes" dest="themes_pl"/> + <copyField source="tag_name" dest="tag_name_pl"/> + +<!-- + <copyField source="cat" dest="text"/> + <copyField source="name" dest="text"/> + <copyField source="manu" dest="text"/> + <copyField source="features" dest="text"/> + <copyField source="includes" dest="text"/> + <copyField source="manu" dest="manu_exact"/> +--> + <!-- Copy the price into a currency enabled field (default USD) --> +<!-- <copyField source="price" dest="price_c"/>--> + + <!-- Text fields from SolrCell to search by default in our catch-all field --> +<!-- <copyField source="title" dest="text"/> + <copyField source="author" dest="text"/> + <copyField source="description" dest="text"/> + <copyField source="keywords" dest="text"/> + <copyField source="content" dest="text"/> + <copyField source="content_type" dest="text"/> + <copyField source="resourcename" dest="text"/> + <copyField source="url" dest="text"/>--> + + <!-- Create a string version of author for faceting --> +<!-- <copyField source="author" dest="author_s"/>--> + + <!-- Above, multiple source fields are copied to the [text] field. + Another way to map multiple source fields to the same + destination field is to use the dynamic field syntax. + copyField also supports a maxChars to copy setting. --> + + <!-- <copyField source="*_t" dest="text" maxChars="3000"/> --> + + <!-- copy name to alphaNameSort, a field designed for sorting by name --> + <!-- <copyField source="name" dest="alphaNameSort"/> --> + + <types> + <!-- field type definitions. The "name" attribute is + just a label to be used by field definitions. The "class" + attribute and any other attributes determine the real + behavior of the fieldType. + Class names starting with "solr" refer to java classes in a + standard package such as org.apache.solr.analysis + --> + + <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> + <fieldType name="string" class="solr.StrField" sortMissingLast="true" /> + + <!-- boolean type: "true" or "false" --> + <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> + + <!-- sortMissingLast and sortMissingFirst attributes are optional attributes are + currently supported on types that are sorted internally as strings + and on numeric types. + This includes "string","boolean", and, as of 3.5 (and 4.x), + int, float, long, date, double, including the "Trie" variants. + - If sortMissingLast="true", then a sort on this field will cause documents + without the field to come after documents with the field, + regardless of the requested sort order (asc or desc). 
+ - If sortMissingFirst="true", then a sort on this field will cause documents + without the field to come before documents with the field, + regardless of the requested sort order. + - If sortMissingLast="false" and sortMissingFirst="false" (the default), + then default lucene sorting will be used which places docs without the + field first in an ascending sort and last in a descending sort. + --> + + <!-- + Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. + --> + <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/> + <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/> + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> + <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/> + + <!-- + Numeric field types that index each value at various levels of precision + to accelerate range queries when the number of values between the range + endpoints is large. See the javadoc for NumericRangeQuery for internal + implementation details. + + Smaller precisionStep values (specified in bits) will lead to more tokens + indexed per value, slightly larger index size, and faster range queries. + A precisionStep of 0 disables indexing at different precision levels. + --> + <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/> + <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/> + <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/> + <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/> + + <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and + is a more restricted form of the canonical representation of dateTime + http://www.w3.org/TR/xmlschema-2/#dateTime + The trailing "Z" designates UTC time and is mandatory. + Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z + All other components are mandatory. + + Expressions can also be used to denote calculations that should be + performed relative to "NOW" to determine the value, ie... + + NOW/HOUR + ... Round to the start of the current hour + NOW-1DAY + ... Exactly 1 day prior to now + NOW/DAY+6MONTHS+3DAYS + ... 6 months and 3 days in the future from the start of + the current day + + Consult the DateField javadocs for more information. + + Note: For faster range queries, consider the tdate type + --> + <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/> + + <!-- A Trie based date field for faster date range queries and date faceting. --> + <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/> + + + <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings --> + <fieldtype name="binary" class="solr.BinaryField"/> + + <!-- + Note: + These should only be used for compatibility with existing indexes (created with lucene or older Solr versions). + Use Trie based fields instead. 
As of Solr 3.5 and 4.x, Trie based fields support sortMissingFirst/Last + + Plain numeric field types that store and index the text + value verbatim (and hence don't correctly support range queries, since the + lexicographic ordering isn't equal to the numeric ordering) + --> + <fieldType name="pint" class="solr.IntField"/> + <fieldType name="plong" class="solr.LongField"/> + <fieldType name="pfloat" class="solr.FloatField"/> + <fieldType name="pdouble" class="solr.DoubleField"/> + <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/> + + <!-- The "RandomSortField" is not used to store or search any + data. You can declare fields of this type it in your schema + to generate pseudo-random orderings of your docs for sorting + or function purposes. The ordering is generated based on the field + name and the version of the index. As long as the index version + remains unchanged, and the same field name is reused, + the ordering of the docs will be consistent. + If you want different psuedo-random orderings of documents, + for the same version of the index, use a dynamicField and + change the field name in the request. + --> + <fieldType name="random" class="solr.RandomSortField" indexed="true" /> + + <!-- solr.TextField allows the specification of custom text analyzers + specified as a tokenizer and a list of token filters. Different + analyzers may be specified for indexing and querying. + + The optional positionIncrementGap puts space between multiple fields of + this type on the same document, with the purpose of preventing false phrase + matching across fields. + + For more info on customizing your analyzer chain, please see + http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters + --> + + <!-- One can also specify an existing Analyzer class that has a + default constructor via the class attribute on the analyzer element. + Example: + <fieldType name="text_greek" class="solr.TextField"> + <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> + </fieldType> + --> + + <fieldType name="uuid" class="solr.UUIDField" indexed="true" /> + + + <!-- A text field that only splits on whitespace for exact matching of words --> + <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + </analyzer> + </fieldType> + + <!-- A general text field that has reasonable, generic + cross-language defaults: it tokenizes with StandardTokenizer, + removes stop words from case-insensitive "stopwords.txt" + (empty by default), and down cases. At query time only, it + also applies synonyms. 
--> + <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + <!-- A text field with defaults appropriate for English: it + tokenizes with StandardTokenizer, removes English stop words + (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and + finally applies Porter's stemming. The query time analyzer + also applies synonyms from synonyms.txt. --> + <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. + add enablePositionIncrements=true in both the index and query + analyzers to leave a 'gap' for more accurate phrase queries. + --> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="lang/stopwords_en.txt" + enablePositionIncrements="true" + /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPossessiveFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: + <filter class="solr.EnglishMinimalStemFilterFactory"/> + --> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="lang/stopwords_en.txt" + enablePositionIncrements="true" + /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPossessiveFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: + <filter class="solr.EnglishMinimalStemFilterFactory"/> + --> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- A text field with defaults appropriate for English, plus + aggressive word-splitting and autophrase features enabled. + This field is just like text_en, except it adds + WordDelimiterFilter to enable splitting and matching of + words on case-change, alpha numeric boundaries, and + non-alphanumeric chars. This means certain compound word + cases will work, for example query "wi fi" will match + document "WiFi" or "wi-fi". 
+ --> + <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. + add enablePositionIncrements=true in both the index and query + analyzers to leave a 'gap' for more accurate phrase queries. + --> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="lang/stopwords_en.txt" + enablePositionIncrements="true" + /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="lang/stopwords_en.txt" + enablePositionIncrements="true" + /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Less flexible matching, but less false matches. Probably not ideal for product names, + but may be good for SKUs. Can insert dashes in the wrong place and still match. --> + <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.EnglishMinimalStemFilterFactory"/> + <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes + possible with WordDelimiterFilter in conjuncton with stemming. --> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Just like text_general except it reverses the characters of + each token, to enable more efficient leading wildcard queries. 
--> + <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" + maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + <!-- charFilter + WhitespaceTokenizer --> + <!-- + <fieldType name="text_char_norm" class="solr.TextField" positionIncrementGap="100" > + <analyzer> + <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + </analyzer> + </fieldType> + --> + + <!-- This is an example of using the KeywordTokenizer along + With various TokenFilterFactories to produce a sortable field + that does not include some properties of the source text + --> + <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true"> + <analyzer> + <!-- KeywordTokenizer does no actual tokenizing, so the entire + input string is preserved as a single token + --> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <!-- The LowerCase TokenFilter does what you expect, which can be + when you want your sorting to be case insensitive + --> + <filter class="solr.LowerCaseFilterFactory" /> + <!-- The TrimFilter removes any leading or trailing whitespace --> + <filter class="solr.TrimFilterFactory" /> + <!-- The PatternReplaceFilter gives you the flexibility to use + Java Regular expression to replace any sequence of characters + matching a pattern with an arbitrary replacement string, + which may include back references to portions of the original + string matched by the pattern. + + See the Java Regular Expression documentation for more + information on pattern and replacement string syntax. + + http://java.sun.com/j2se/1.6.0/docs/api/java/util/regex/package-summary.html + --> + <filter class="solr.PatternReplaceFilterFactory" + pattern="([^a-z])" replacement="" replace="all" + /> + </analyzer> + </fieldType> + + <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> + </analyzer> + </fieldtype> + + <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- + The DelimitedPayloadTokenFilter can put payloads on tokens... for example, + a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f + Attributes of the DelimitedPayloadTokenFilterFactory : + "delimiter" - a one character delimiter. 
Default is | (pipe) + "encoder" - how to encode the following value into a playload + float -> org.apache.lucene.analysis.payloads.FloatEncoder, + integer -> o.a.l.a.p.IntegerEncoder + identity -> o.a.l.a.p.IdentityEncoder + Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. + --> + <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> + </analyzer> + </fieldtype> + + <!-- lowercases the entire field value, keeping it as a single token. --> + <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory" /> + </analyzer> + </fieldType> + + <!-- + Example of using PathHierarchyTokenizerFactory at index time, so + queries for paths match documents at that path, or in descendent paths + --> + <fieldType name="descendent_path" class="solr.TextField"> + <analyzer type="index"> + <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.KeywordTokenizerFactory" /> + </analyzer> + </fieldType> + <!-- + Example of using PathHierarchyTokenizerFactory at query time, so + queries for paths match documents at that path, or in ancestor paths + --> + <fieldType name="ancestor_path" class="solr.TextField"> + <analyzer type="index"> + <tokenizer class="solr.KeywordTokenizerFactory" /> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /> + </analyzer> + </fieldType> + + <!-- since fields of this type are by default not stored or indexed, + any data added to them will be ignored outright. --> + <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> + + <!-- This point type indexes the coordinates as separate fields (subFields) + If subFieldType is defined, it references a type, and a dynamic field + definition is created matching *___<typename>. Alternately, if + subFieldSuffix is defined, that is used to create the subFields. + Example: if subFieldType="double", then the coordinates would be + indexed in fields myloc_0___double,myloc_1___double. + Example: if subFieldSuffix="_d" then the coordinates would be indexed + in fields myloc_0_d,myloc_1_d + The subFields are an implementation detail of the fieldType, and end + users normally should not need to know about them. + --> + <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> + + <!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. --> + <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> + + <!-- + A Geohash is a compact representation of a latitude longitude pair in a single field. + See http://wiki.apache.org/solr/SpatialSearch + --> + <fieldtype name="geohash" class="solr.GeoHashField"/> + + <!-- Money/currency field type. See http://wiki.apache.org/solr/MoneyFieldType + Parameters: + defaultCurrency: Specifies the default currency if none specified. 
Defaults to "USD" + precisionStep: Specifies the precisionStep for the TrieLong field used for the amount + providerClass: Lets you plug in other exchange provider backend: + solr.FileExchangeRateProvider is the default and takes one parameter: + currencyConfig: name of an xml file holding exhange rates + solr.OpenExchangeRatesOrgProvider uses rates from openexchangerates.org: + ratesFileLocation: URL or path to rates JSON file (default latest.json on the web) + refreshInterval: Number of minutes between each rates fetch (default: 1440, min: 60) + --> +<!-- <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" /> - nie dziala --> + + + + <!-- some examples for different languages (generally ordered by ISO code) --> + + <!-- Arabic --> + <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- for any non-arabic --> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/> + <!-- normalizes ﻯ to ï»±, etc --> + <filter class="solr.ArabicNormalizationFilterFactory"/> + <filter class="solr.ArabicStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Bulgarian --> + <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/> + <filter class="solr.BulgarianStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Catalan --> + <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- removes l', etc --> + <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Catalan"/> + </analyzer> + </fieldType> + + <!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) --> + <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- normalize width before bigram, as e.g. 
half-width dakuten combine --> + <filter class="solr.CJKWidthFilterFactory"/> + <!-- for any non-CJK --> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.CJKBigramFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Czech --> + <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/> + <filter class="solr.CzechStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Danish --> + <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Danish"/> + </analyzer> + </fieldType> + + <!-- German --> + <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.GermanNormalizationFilterFactory"/> + <filter class="solr.GermanLightStemFilterFactory"/> + <!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> --> + <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="German2"/> --> + </analyzer> + </fieldType> + + <!-- Greek --> + <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- greek specific lowercase for sigma --> + <filter class="solr.GreekLowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/> + <filter class="solr.GreekStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Spanish --> + <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.SpanishLightStemFilterFactory"/> + <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> --> + </analyzer> + </fieldType> + + <!-- Basque --> + <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Basque"/> + </analyzer> + </fieldType> + + <!-- Persian --> + <fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <!-- for ZWNJ --> + <charFilter class="solr.PersianCharFilterFactory"/> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ArabicNormalizationFilterFactory"/> 
+ <filter class="solr.PersianNormalizationFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/> + </analyzer> + </fieldType> + + <!-- Finnish --> + <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/> + <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> --> + </analyzer> + </fieldType> + + <!-- French --> + <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- removes l', etc --> + <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.FrenchLightStemFilterFactory"/> + <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> --> + <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> --> + </analyzer> + </fieldType> + + <!-- Irish --> + <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- removes d', etc --> + <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/> + <!-- removes n-, etc. position increments is intentionally false! 
--> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/> + <filter class="solr.IrishLowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Irish"/> + </analyzer> + </fieldType> + + <!-- Galician --> + <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/> + <filter class="solr.GalicianStemFilterFactory"/> + <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> --> + </analyzer> + </fieldType> + + <!-- Hindi --> + <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <!-- normalizes unicode representation --> + <filter class="solr.IndicNormalizationFilterFactory"/> + <!-- normalizes variation in spelling --> + <filter class="solr.HindiNormalizationFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/> + <filter class="solr.HindiStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Hungarian --> + <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/> + <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> --> + </analyzer> + </fieldType> + + <!-- Armenian --> + <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/> + </analyzer> + </fieldType> + + <!-- Indonesian --> + <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/> + <!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false --> + <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/> + </analyzer> + </fieldType> + + <!-- Italian --> + <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- removes l', etc --> + <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/> + <filter 
class="solr.ItalianLightStemFilterFactory"/> + <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> --> + </analyzer> + </fieldType> + + <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming) + + NOTE: If you want to optimize search for precision, use default operator AND in your query + parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use + OR if you would like to optimize for recall (default). + --> + <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> + <analyzer> + <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer) + + Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic + is used to segment compounds into its parts and the compound itself is kept as synonym. + + Valid values for attribute mode are: + normal: regular segmentation + search: segmentation useful for search with synonyms compounds (default) + extended: same as search mode, but unigrams unknown words (experimental) + + For some applications it might be good to use search mode for indexing and normal mode for + queries to reduce recall and prevent parts of compounds from being matched and highlighted. + Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query. + + Kuromoji also has a convenient user dictionary feature that allows overriding the statistical + model with your own entries for segmentation, part-of-speech tags and readings without a need + to specify weights. Notice that user dictionaries have not been subject to extensive testing. + + User dictionary attributes are: + userDictionary: user dictionary filename + userDictionaryEncoding: user dictionary encoding (default is UTF-8) + + See lang/userdict_ja.txt for a sample user dictionary file. + + Punctuation characters are discarded by default. Use discardPunctuation="false" to keep them. + + See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. 
+    -->
+        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
+        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
+        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
+        <filter class="solr.JapaneseBaseFormFilterFactory"/>
+        <!-- Removes tokens with certain part-of-speech tags -->
+        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+        <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
+        <filter class="solr.CJKWidthFilterFactory"/>
+        <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+        <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
+        <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
+        <!-- Lower-cases romaji characters -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Latvian -->
+    <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/>
+        <filter class="solr.LatvianStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Dutch -->
+    <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Norwegian -->
+    <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
+        <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
+        <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
+      </analyzer>
+    </fieldType>
+
+    <!-- Polish -->
+    <fieldType name="text_pl" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+      </analyzer>
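+      <!-- Illustrative alternative for the Polish stemming step above, not part of this
+           configuration (assumes the analysis-extras contrib jars are on the classpath):
+           <filter class="solr.StempelPolishStemFilterFactory"/> -->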
</fieldType> + + + <!-- Portuguese --> + <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.PortugueseLightStemFilterFactory"/> + <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> + <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> + <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> --> + </analyzer> + </fieldType> + + <!-- Romanian --> + <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/> + </analyzer> + </fieldType> + + <!-- Russian --> + <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Russian"/> + <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> --> + </analyzer> + </fieldType> + + <!-- Swedish --> + <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/> + <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> --> + </analyzer> + </fieldType> + + <!-- Thai --> + <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ThaiWordFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/> + </analyzer> + </fieldType> + + <!-- Turkish --> + <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.TurkishLowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/> + <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/> + </analyzer> + </fieldType> + + </types> + + <!-- Similarity is the scoring routine for each document vs. a query. + A custom Similarity or SimilarityFactory may be specified here, but + the default is fine for most applications. 
+ For more info: http://wiki.apache.org/solr/SchemaXml#Similarity + --> + <!-- + <similarity class="com.example.solr.CustomSimilarityFactory"> + <str name="paramkey">param value</str> + </similarity> + --> + +</schema> diff --git a/doc/solrconfig.xml b/doc/solrconfig.xml new file mode 100644 index 000000000..a27c21383 --- /dev/null +++ b/doc/solrconfig.xml @@ -0,0 +1,1752 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<!-- + For more details about configurations options that may appear in + this file, see http://wiki.apache.org/solr/SolrConfigXml. +--> +<config> + <!-- In all configuration below, a prefix of "solr." for class names + is an alias that causes solr to search appropriate packages, + including org.apache.solr.(search|update|request|core|analysis) + + You may also specify a fully qualified Java classname if you + have your own custom plugins. + --> + + <!-- Controls what version of Lucene various components of Solr + adhere to. Generally, you want to use the latest version to + get all bug fixes and improvements. It is highly recommended + that you fully re-index after changing this setting as it can + affect both how text is indexed and queried. + --> + <luceneMatchVersion>LUCENE_40</luceneMatchVersion> + + <!-- lib directives can be used to instruct Solr to load an Jars + identified and use them to resolve any "plugins" specified in + your solrconfig.xml or schema.xml (ie: Analyzers, Request + Handlers, etc...). + + All directories and paths are resolved relative to the + instanceDir. + + If a "./lib" directory exists in your instanceDir, all files + found in it are included as if you had used the following + syntax... + + <lib dir="./lib" /> + --> + + <!-- A 'dir' option by itself adds any files found in the directory + to the classpath, this is useful for including all jars in a + directory. + --> + <!-- + <lib dir="../add-everything-found-in-this-dir-to-the-classpath" /> + --> + + <!-- When a 'regex' is specified in addition to a 'dir', only the + files in that directory which completely match the regex + (anchored on both ends) will be included. 
+ --> + <lib dir="../../../dist/" regex="apache-solr-cell-\d.*\.jar" /> + <lib dir="../../../contrib/extraction/lib" regex=".*\.jar" /> + + <lib dir="../../../dist/" regex="apache-solr-clustering-\d.*\.jar" /> + <lib dir="../../../contrib/clustering/lib/" regex=".*\.jar" /> + + <lib dir="../../../dist/" regex="apache-solr-langid-\d.*\.jar" /> + <lib dir="../../../contrib/langid/lib/" regex=".*\.jar" /> + + <lib dir="../../../dist/" regex="apache-solr-velocity-\d.*\.jar" /> + <lib dir="../../../contrib/velocity/lib" regex=".*\.jar" /> + + <!-- If a 'dir' option (with or without a regex) is used and nothing + is found that matches, it will be ignored + --> + <lib dir="/total/crap/dir/ignored" /> + + <!-- an exact 'path' can be used instead of a 'dir' to specify a + specific file. This will cause a serious error to be logged if + it can't be loaded. + --> + <!-- + <lib path="../a-jar-that-does-not-exist.jar" /> + --> + + <!-- Data Directory + + Used to specify an alternate directory to hold all index data + other than the default ./data under the Solr home. If + replication is in use, this should match the replication + configuration. + --> + <dataDir>${solr.data.dir:}</dataDir> + + + <!-- The DirectoryFactory to use for indexes. + + solr.StandardDirectoryFactory is filesystem + based and tries to pick the best implementation for the current + JVM and platform. solr.NRTCachingDirectoryFactory, the default, + wraps solr.StandardDirectoryFactory and caches small files in memory + for better NRT performance. + + One can force a particular implementation via solr.MMapDirectoryFactory, + solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory. + + solr.RAMDirectoryFactory is memory based, not + persistent, and doesn't work with replication. + --> + <directoryFactory name="DirectoryFactory" + class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/> + + <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Index Config - These settings control low-level behavior of indexing + Most example settings here show the default value, but are commented + out, to more easily see where customizations have been made. + + Note: This replaces <indexDefaults> and <mainIndex> from older versions + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --> + <indexConfig> + <!-- maxFieldLength was removed in 4.0. To get similar behavior, include a + LimitTokenCountFilterFactory in your fieldType definition. E.g. + <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10000"/> + --> + <!-- Maximum time to wait for a write lock (ms) for an IndexWriter. Default: 1000 --> + <!-- <writeLockTimeout>1000</writeLockTimeout> --> + + <!-- Expert: Enabling compound file will use less files for the index, + using fewer file descriptors on the expense of performance decrease. + Default in Lucene is "true". Default in Solr is "false" (since 3.6) --> + <!-- <useCompoundFile>false</useCompoundFile> --> + + <!-- ramBufferSizeMB sets the amount of RAM that may be used by Lucene + indexing for buffering added documents and deletions before they are + flushed to the Directory. + maxBufferedDocs sets a limit on the number of documents buffered + before flushing. + If both ramBufferSizeMB and maxBufferedDocs is set, then + Lucene will flush based on whichever limit is hit first. 
--> + <!-- <ramBufferSizeMB>32</ramBufferSizeMB> --> + <!-- <maxBufferedDocs>1000</maxBufferedDocs> --> + + <!-- Expert: Merge Policy + The Merge Policy in Lucene controls how merging of segments is done. + The default since Solr/Lucene 3.3 is TieredMergePolicy. + The default since Lucene 2.3 was the LogByteSizeMergePolicy, + Even older versions of Lucene used LogDocMergePolicy. + --> + <!-- + <mergePolicy class="org.apache.lucene.index.TieredMergePolicy"> + <int name="maxMergeAtOnce">10</int> + <int name="segmentsPerTier">10</int> + </mergePolicy> + --> + + <!-- Merge Factor + The merge factor controls how many segments will get merged at a time. + For TieredMergePolicy, mergeFactor is a convenience parameter which + will set both MaxMergeAtOnce and SegmentsPerTier at once. + For LogByteSizeMergePolicy, mergeFactor decides how many new segments + will be allowed before they are merged into one. + Default is 10 for both merge policies. + --> + <!-- + <mergeFactor>10</mergeFactor> + --> + + <!-- Expert: Merge Scheduler + The Merge Scheduler in Lucene controls how merges are + performed. The ConcurrentMergeScheduler (Lucene 2.3 default) + can perform merges in the background using separate threads. + The SerialMergeScheduler (Lucene 2.2 default) does not. + --> + <!-- + <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/> + --> + + <!-- LockFactory + + This option specifies which Lucene LockFactory implementation + to use. + + single = SingleInstanceLockFactory - suggested for a + read-only index or when there is no possibility of + another process trying to modify the index. + native = NativeFSLockFactory - uses OS native file locking. + Do not use when multiple solr webapps in the same + JVM are attempting to share a single index. + simple = SimpleFSLockFactory - uses a plain file for locking + + Defaults: 'native' is default for Solr3.6 and later, otherwise + 'simple' is the default + + More details on the nuances of each LockFactory... + http://wiki.apache.org/lucene-java/AvailableLockFactories + --> + <!-- <lockType>native</lockType> --> + + <!-- Unlock On Startup + + If true, unlock any held write or commit locks on startup. + This defeats the locking mechanism that allows multiple + processes to safely access a lucene index, and should be used + with care. Default is "false". + + This is not needed if lock type is 'none' or 'single' + --> + <!-- + <unlockOnStartup>false</unlockOnStartup> + --> + + <!-- Expert: Controls how often Lucene loads terms into memory + Default is 128 and is likely good for most everyone. + --> + <!-- <termIndexInterval>128</termIndexInterval> --> + + <!-- If true, IndexReaders will be reopened (often more efficient) + instead of closed and then opened. Default: true + --> + <!-- + <reopenReaders>true</reopenReaders> + --> + + <!-- Commit Deletion Policy + + Custom deletion policies can be specified here. The class must + implement org.apache.lucene.index.IndexDeletionPolicy. + + http://lucene.apache.org/java/3_5_0/api/core/org/apache/lucene/index/IndexDeletionPolicy.html + + The default Solr IndexDeletionPolicy implementation supports + deleting index commit points on number of commits, age of + commit point and optimized status. + + The latest commit point should always be preserved regardless + of the criteria. 
+ --> + <!-- + <deletionPolicy class="solr.SolrDeletionPolicy"> + --> + <!-- The number of commit points to be kept --> + <!-- <str name="maxCommitsToKeep">1</str> --> + <!-- The number of optimized commit points to be kept --> + <!-- <str name="maxOptimizedCommitsToKeep">0</str> --> + <!-- + Delete all commit points once they have reached the given age. + Supports DateMathParser syntax e.g. + --> + <!-- + <str name="maxCommitAge">30MINUTES</str> + <str name="maxCommitAge">1DAY</str> + --> + <!-- + </deletionPolicy> + --> + + <!-- Lucene Infostream + + To aid in advanced debugging, Lucene provides an "InfoStream" + of detailed information when indexing. + + Setting The value to true will instruct the underlying Lucene + IndexWriter to write its debugging info the specified file + --> + <!-- <infoStream file="INFOSTREAM.txt">false</infoStream> --> + </indexConfig> + + + <!-- JMX + + This example enables JMX if and only if an existing MBeanServer + is found, use this if you want to configure JMX through JVM + parameters. Remove this to disable exposing Solr configuration + and statistics to JMX. + + For more details see http://wiki.apache.org/solr/SolrJmx + --> + <jmx /> + <!-- If you want to connect to a particular server, specify the + agentId + --> + <!-- <jmx agentId="myAgent" /> --> + <!-- If you want to start a new MBeanServer, specify the serviceUrl --> + <!-- <jmx serviceUrl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr"/> + --> + + <!-- The default high-performance update handler --> + <updateHandler class="solr.DirectUpdateHandler2"> + + <!-- AutoCommit + + Perform a hard commit automatically under certain conditions. + Instead of enabling autoCommit, consider using "commitWithin" + when adding documents. + + http://wiki.apache.org/solr/UpdateXmlMessages + + maxDocs - Maximum number of documents to add since the last + commit before automatically triggering a new commit. + + maxTime - Maximum amount of time in ms that is allowed to pass + since a document was added before automaticly + triggering a new commit. + openSearcher - if false, the commit causes recent index changes + to be flushed to stable storage, but does not cause a new + searcher to be opened to make those changes visible. + --> + <autoCommit> + <maxTime>15000</maxTime> + <openSearcher>false</openSearcher> + </autoCommit> + + <!-- softAutoCommit is like autoCommit except it causes a + 'soft' commit which only ensures that changes are visible + but does not ensure that data is synced to disk. This is + faster and more near-realtime friendly than a hard commit. + --> + <!-- + <autoSoftCommit> + <maxTime>1000</maxTime> + </autoSoftCommit> + --> + + <!-- Update Related Event Listeners + + Various IndexWriter related events can trigger Listeners to + take actions. + + postCommit - fired after every commit or optimize command + postOptimize - fired after every optimize command + --> + <!-- The RunExecutableListener executes an external command from a + hook such as postCommit or postOptimize. + + exe - the name of the executable to run + dir - dir to use as the current working directory. (default=".") + wait - the calling thread waits until the executable returns. + (default="true") + args - the arguments to pass to the program. (default is none) + env - environment variables to set. (default is none) + --> + <!-- This example shows how RunExecutableListener could be used + with the script based replication... 
+ http://wiki.apache.org/solr/CollectionDistribution + --> + <!-- + <listener event="postCommit" class="solr.RunExecutableListener"> + <str name="exe">solr/bin/snapshooter</str> + <str name="dir">.</str> + <bool name="wait">true</bool> + <arr name="args"> <str>arg1</str> <str>arg2</str> </arr> + <arr name="env"> <str>MYVAR=val1</str> </arr> + </listener> + --> + + <!-- Enables a transaction log, currently used for real-time get. + "dir" - the target directory for transaction logs, defaults to the + solr data directory. --> + <updateLog> + <str name="dir">${solr.data.dir:}</str> + </updateLog> + + + </updateHandler> + + <!-- IndexReaderFactory + + Use the following format to specify a custom IndexReaderFactory, + which allows for alternate IndexReader implementations. + + ** Experimental Feature ** + + Please note - Using a custom IndexReaderFactory may prevent + certain other features from working. The API to + IndexReaderFactory may change without warning or may even be + removed from future releases if the problems cannot be + resolved. + + + ** Features that may not work with custom IndexReaderFactory ** + + The ReplicationHandler assumes a disk-resident index. Using a + custom IndexReader implementation may cause incompatibility + with ReplicationHandler and may cause replication to not work + correctly. See SOLR-1366 for details. + + --> + <!-- + <indexReaderFactory name="IndexReaderFactory" class="package.class"> + <str name="someArg">Some Value</str> + </indexReaderFactory > + --> + <!-- By explicitly declaring the Factory, the termIndexDivisor can + be specified. + --> + <!-- + <indexReaderFactory name="IndexReaderFactory" + class="solr.StandardIndexReaderFactory"> + <int name="setTermIndexDivisor">12</int> + </indexReaderFactory > + --> + + <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Query section - these settings control query time things like caches + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --> + <query> + <!-- Max Boolean Clauses + + Maximum number of clauses in each BooleanQuery, an exception + is thrown if exceeded. + + ** WARNING ** + + This option actually modifies a global Lucene property that + will affect all SolrCores. If multiple solrconfig.xml files + disagree on this property, the value at any given moment will + be based on the last SolrCore to be initialized. + + --> + <maxBooleanClauses>1024</maxBooleanClauses> + + + <!-- Solr Internal Query Caches + + There are two implementations of cache available for Solr, + LRUCache, based on a synchronized LinkedHashMap, and + FastLRUCache, based on a ConcurrentHashMap. + + FastLRUCache has faster gets and slower puts in single + threaded operation and thus is generally faster than LRUCache + when the hit ratio of the cache is high (> 75%), and may be + faster under other scenarios on multi-cpu systems. + --> + + <!-- Filter Cache + + Cache used by SolrIndexSearcher for filters (DocSets), + unordered sets of *all* documents that match a query. When a + new searcher is opened, its caches may be prepopulated or + "autowarmed" using data from caches in the old searcher. + autowarmCount is the number of items to prepopulate. For + LRUCache, the autowarmed items will be the most recently + accessed items. + + Parameters: + class - the SolrCache implementation LRUCache or + (LRUCache or FastLRUCache) + size - the maximum number of entries in the cache + initialSize - the initial capacity (number of entries) of + the cache. 
(see java.util.HashMap) + autowarmCount - the number of entries to prepopulate from + and old cache. + --> + <filterCache class="solr.FastLRUCache" + size="512" + initialSize="512" + autowarmCount="0"/> + + <!-- Query Result Cache + + Caches results of searches - ordered lists of document ids + (DocList) based on a query, a sort, and the range of documents requested. + --> + <queryResultCache class="solr.LRUCache" + size="512" + initialSize="512" + autowarmCount="0"/> + + <!-- Document Cache + + Caches Lucene Document objects (the stored fields for each + document). Since Lucene internal document ids are transient, + this cache will not be autowarmed. + --> + <documentCache class="solr.LRUCache" + size="512" + initialSize="512" + autowarmCount="0"/> + + <!-- Field Value Cache + + Cache used to hold field values that are quickly accessible + by document id. The fieldValueCache is created by default + even if not configured here. + --> + <!-- + <fieldValueCache class="solr.FastLRUCache" + size="512" + autowarmCount="128" + showItems="32" /> + --> + + <!-- Custom Cache + + Example of a generic cache. These caches may be accessed by + name through SolrIndexSearcher.getCache(),cacheLookup(), and + cacheInsert(). The purpose is to enable easy caching of + user/application level data. The regenerator argument should + be specified as an implementation of solr.CacheRegenerator + if autowarming is desired. + --> + <!-- + <cache name="myUserCache" + class="solr.LRUCache" + size="4096" + initialSize="1024" + autowarmCount="1024" + regenerator="com.mycompany.MyRegenerator" + /> + --> + + + <!-- Lazy Field Loading + + If true, stored fields that are not requested will be loaded + lazily. This can result in a significant speed improvement + if the usual case is to not load all stored fields, + especially if the skipped fields are large compressed text + fields. + --> + <enableLazyFieldLoading>true</enableLazyFieldLoading> + + <!-- Use Filter For Sorted Query + + A possible optimization that attempts to use a filter to + satisfy a search. If the requested sort does not include + score, then the filterCache will be checked for a filter + matching the query. If found, the filter will be used as the + source of document ids, and then the sort will be applied to + that. + + For most situations, this will not be useful unless you + frequently get the same search repeatedly with different sort + options, and none of them ever use "score" + --> + <!-- + <useFilterForSortedQuery>true</useFilterForSortedQuery> + --> + + <!-- Result Window Size + + An optimization for use with the queryResultCache. When a search + is requested, a superset of the requested number of document ids + are collected. For example, if a search for a particular query + requests matching documents 10 through 19, and queryWindowSize is 50, + then documents 0 through 49 will be collected and cached. Any further + requests in that range can be satisfied via the cache. + --> + <queryResultWindowSize>20</queryResultWindowSize> + + <!-- Maximum number of documents to cache for any entry in the + queryResultCache. + --> + <queryResultMaxDocsCached>200</queryResultMaxDocsCached> + + <!-- Query Related Event Listeners + + Various IndexSearcher related events can trigger Listeners to + take actions. + + newSearcher - fired whenever a new searcher is being prepared + and there is a current searcher handling requests (aka + registered). It can be used to prime certain caches to + prevent long request times for certain requests. 
+ + firstSearcher - fired whenever a new searcher is being + prepared but there is no current registered searcher to handle + requests or to gain autowarming data from. + + + --> + <!-- QuerySenderListener takes an array of NamedList and executes a + local query request for each NamedList in sequence. + --> + <listener event="newSearcher" class="solr.QuerySenderListener"> + <arr name="queries"> + <!-- + <lst><str name="q">solr</str><str name="sort">price asc</str></lst> + <lst><str name="q">rocks</str><str name="sort">weight asc</str></lst> + --> + </arr> + </listener> + <listener event="firstSearcher" class="solr.QuerySenderListener"> + <arr name="queries"> + <lst> + <str name="q">static firstSearcher warming in solrconfig.xml</str> + </lst> + </arr> + </listener> + + <!-- Use Cold Searcher + + If a search request comes in and there is no current + registered searcher, then immediately register the still + warming searcher and use it. If "false" then all requests + will block until the first searcher is done warming. + --> + <useColdSearcher>false</useColdSearcher> + + <!-- Max Warming Searchers + + Maximum number of searchers that may be warming in the + background concurrently. An error is returned if this limit + is exceeded. + + Recommend values of 1-2 for read-only slaves, higher for + masters w/o cache warming. + --> + <maxWarmingSearchers>2</maxWarmingSearchers> + + </query> + + + <!-- Request Dispatcher + + This section contains instructions for how the SolrDispatchFilter + should behave when processing requests for this SolrCore. + + handleSelect is a legacy option that affects the behavior of requests + such as /select?qt=XXX + + handleSelect="true" will cause the SolrDispatchFilter to process + the request and dispatch the query to a handler specified by the + "qt" param, assuming "/select" isn't already registered. + + handleSelect="false" will cause the SolrDispatchFilter to + ignore "/select" requests, resulting in a 404 unless a handler + is explicitly registered with the name "/select" + + handleSelect="true" is not recommended for new users, but is the default + for backwards compatibility + --> + <requestDispatcher handleSelect="false" > + <!-- Request Parsing + + These settings indicate how Solr Requests may be parsed, and + what restrictions may be placed on the ContentStreams from + those requests + + enableRemoteStreaming - enables use of the stream.file + and stream.url parameters for specifying remote streams. + + multipartUploadLimitInKB - specifies the max size of + Multipart File Uploads that Solr will allow in a Request. + + *** WARNING *** + The settings below authorize Solr to fetch remote files, You + should make sure your system has some authentication before + using enableRemoteStreaming="true" + + --> + <requestParsers enableRemoteStreaming="true" + multipartUploadLimitInKB="2048000" /> + + <!-- HTTP Caching + + Set HTTP caching related parameters (for proxy caches and clients). + + The options below instruct Solr not to output any HTTP Caching + related headers + --> + <httpCaching never304="true" /> + <!-- If you include a <cacheControl> directive, it will be used to + generate a Cache-Control header (as well as an Expires header + if the value contains "max-age=") + + By default, no Cache-Control header is generated. 
+ + You can use the <cacheControl> option even if you have set + never304="true" + --> + <!-- + <httpCaching never304="true" > + <cacheControl>max-age=30, public</cacheControl> + </httpCaching> + --> + <!-- To enable Solr to respond with automatically generated HTTP + Caching headers, and to response to Cache Validation requests + correctly, set the value of never304="false" + + This will cause Solr to generate Last-Modified and ETag + headers based on the properties of the Index. + + The following options can also be specified to affect the + values of these headers... + + lastModFrom - the default value is "openTime" which means the + Last-Modified value (and validation against If-Modified-Since + requests) will all be relative to when the current Searcher + was opened. You can change it to lastModFrom="dirLastMod" if + you want the value to exactly correspond to when the physical + index was last modified. + + etagSeed="..." is an option you can change to force the ETag + header (and validation against If-None-Match requests) to be + different even if the index has not changed (ie: when making + significant changes to your config file) + + (lastModifiedFrom and etagSeed are both ignored if you use + the never304="true" option) + --> + <!-- + <httpCaching lastModifiedFrom="openTime" + etagSeed="Solr"> + <cacheControl>max-age=30, public</cacheControl> + </httpCaching> + --> + </requestDispatcher> + + <!-- Request Handlers + + http://wiki.apache.org/solr/SolrRequestHandler + + Incoming queries will be dispatched to a specific handler by name + based on the path specified in the request. + + Legacy behavior: If the request path uses "/select" but no Request + Handler has that name, and if handleSelect="true" has been specified in + the requestDispatcher, then the Request Handler is dispatched based on + the qt parameter. Handlers without a leading '/' are accessed this way + like so: http://host/app/[core/]select?qt=name If no qt is + given, then the requestHandler that declares default="true" will be + used or the one named "standard". + + If a Request Handler is declared with startup="lazy", then it will + not be initialized until the first request that uses it. + + --> + <!-- SearchHandler + + http://wiki.apache.org/solr/SearchHandler + + For processing Search Queries, the primary Request Handler + provided with Solr is "SearchHandler" It delegates to a sequent + of SearchComponents (see below) and supports distributed + queries across multiple shards + --> + <requestHandler name="/select" class="solr.SearchHandler"> + <!-- default values for query parameters can be specified, these + will be overridden by parameters in the request + --> + <lst name="defaults"> + <str name="echoParams">explicit</str> + <int name="rows">50</int> + <str name="df">text</str> + <bool name="tv">true</bool> + </lst> + <!-- In addition to defaults, "appends" params can be specified + to identify values which should be appended to the list of + multi-val params from the query (or the existing "defaults"). + --> + <!-- In this example, the param "fq=instock:true" would be appended to + any query time fq params the user may specify, as a mechanism for + partitioning the index, independent of any user selected filtering + that may also be desired (perhaps as a result of faceted searching). + + NOTE: there is *absolutely* nothing a client can do to prevent these + "appends" values from being used, so don't use this mechanism + unless you are sure you always want it. 
+ --> + <!-- + <lst name="appends"> + <str name="fq">inStock:true</str> + </lst> + --> + <!-- "invariants" are a way of letting the Solr maintainer lock down + the options available to Solr clients. Any params values + specified here are used regardless of what values may be specified + in either the query, the "defaults", or the "appends" params. + + In this example, the facet.field and facet.query params would + be fixed, limiting the facets clients can use. Faceting is + not turned on by default - but if the client does specify + facet=true in the request, these are the only facets they + will be able to see counts for; regardless of what other + facet.field or facet.query params they may specify. + + NOTE: there is *absolutely* nothing a client can do to prevent these + "invariants" values from being used, so don't use this mechanism + unless you are sure you always want it. + --> + <!-- + <lst name="invariants"> + <str name="facet.field">cat</str> + <str name="facet.field">manu_exact</str> + <str name="facet.query">price:[* TO 500]</str> + <str name="facet.query">price:[500 TO *]</str> + </lst> + --> + <!-- If the default list of SearchComponents is not desired, that + list can either be overridden completely, or components can be + prepended or appended to the default list. (see below) + --> + <!-- + <arr name="components"> + <str>nameOfCustomComponent1</str> + <str>nameOfCustomComponent2</str> + </arr> + --> + <arr name="last-components"> + <str>tvComponent</str> + </arr> + </requestHandler> + + <!-- A request handler that returns indented JSON by default --> + <requestHandler name="/query" class="solr.SearchHandler"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + <str name="wt">json</str> + <str name="indent">true</str> + <str name="df">text</str> + </lst> + </requestHandler> + + + <!-- realtime get handler, guaranteed to return the latest stored fields of + any document, without the need to commit or open a new searcher. The + current implementation relies on the updateLog feature being enabled. 
--> + <requestHandler name="/get" class="solr.RealTimeGetHandler"> + <lst name="defaults"> + <str name="omitHeader">true</str> + <str name="wt">json</str> + <str name="indent">true</str> + </lst> + </requestHandler> + + + <!-- A Robust Example + + This example SearchHandler declaration shows off usage of the + SearchHandler with many defaults declared + + Note that multiple instances of the same Request Handler + (SearchHandler) can be registered multiple times with different + names (and different init parameters) + --> + <requestHandler name="/browse" class="solr.SearchHandler"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + + <!-- VelocityResponseWriter settings --> + <str name="wt">velocity</str> + <str name="v.template">browse</str> + <str name="v.layout">layout</str> + <str name="title">Solritas</str> + + <!-- Query settings --> + <str name="defType">edismax</str> + <str name="qf"> + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0 + </str> + <str name="df">text</str> + <str name="mm">100%</str> + <str name="q.alt">*:*</str> + <str name="rows">10</str> + <str name="fl">*,score</str> + + <str name="mlt.qf"> + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0 + </str> + <str name="mlt.fl">text,features,name,sku,id,manu,cat,title,description,keywords,author,resourcename</str> + <int name="mlt.count">3</int> + + <!-- Faceting defaults --> + <str name="facet">on</str> + <str name="facet.field">cat</str> + <str name="facet.field">manu_exact</str> + <str name="facet.field">content_type</str> + <str name="facet.field">author_s</str> + <str name="facet.query">ipod</str> + <str name="facet.query">GB</str> + <str name="facet.mincount">1</str> + <str name="facet.pivot">cat,inStock</str> + <str name="facet.range.other">after</str> + <str name="facet.range">price</str> + <int name="f.price.facet.range.start">0</int> + <int name="f.price.facet.range.end">600</int> + <int name="f.price.facet.range.gap">50</int> + <str name="facet.range">popularity</str> + <int name="f.popularity.facet.range.start">0</int> + <int name="f.popularity.facet.range.end">10</int> + <int name="f.popularity.facet.range.gap">3</int> + <str name="facet.range">manufacturedate_dt</str> + <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str> + <str name="f.manufacturedate_dt.facet.range.end">NOW</str> + <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str> + <str name="f.manufacturedate_dt.facet.range.other">before</str> + <str name="f.manufacturedate_dt.facet.range.other">after</str> + + <!-- Highlighting defaults --> + <str name="hl">on</str> + <str name="hl.fl">content features title name</str> + <str name="hl.encoder">html</str> + <str name="hl.simple.pre"><b></str> + <str name="hl.simple.post"></b></str> + <str name="f.title.hl.fragsize">0</str> + <str name="f.title.hl.alternateField">title</str> + <str name="f.name.hl.fragsize">0</str> + <str name="f.name.hl.alternateField">name</str> + <str name="f.content.hl.snippets">3</str> + <str name="f.content.hl.fragsize">200</str> + <str name="f.content.hl.alternateField">content</str> + <str name="f.content.hl.maxAlternateFieldLength">750</str> + + <!-- Spell checking defaults --> + <str name="spellcheck">on</str> + <str name="spellcheck.extendedResults">false</str> + <str name="spellcheck.count">5</str> + <str name="spellcheck.alternativeTermCount">2</str> + <str 
name="spellcheck.maxResultsForSuggest">5</str> + <str name="spellcheck.collate">true</str> + <str name="spellcheck.collateExtendedResults">true</str> + <str name="spellcheck.maxCollationTries">5</str> + <str name="spellcheck.maxCollations">3</str> + </lst> + + <!-- append spellchecking to our list of components --> + <arr name="last-components"> + <str>spellcheck</str> + </arr> + </requestHandler> + + + <!-- Update Request Handler. + + http://wiki.apache.org/solr/UpdateXmlMessages + + The canonical Request Handler for Modifying the Index through + commands specified using XML, JSON, CSV, or JAVABIN + + Note: Since solr1.1 requestHandlers requires a valid content + type header if posted in the body. For example, curl now + requires: -H 'Content-type:text/xml; charset=utf-8' + + To override the request content type and force a specific + Content-type, use the request parameter: + ?update.contentType=text/csv + + This handler will pick a response format to match the input + if the 'wt' parameter is not explicit + --> + <requestHandler name="/update" class="solr.UpdateRequestHandler"> + <!-- See below for information on defining + updateRequestProcessorChains that can be used by name + on each Update Request + --> + <!-- + <lst name="defaults"> + <str name="update.chain">dedupe</str> + </lst> + --> + </requestHandler> + + + <!-- Solr Cell Update Request Handler + + http://wiki.apache.org/solr/ExtractingRequestHandler + + --> + <requestHandler name="/update/extract" + startup="lazy" + class="solr.extraction.ExtractingRequestHandler" > + <lst name="defaults"> + <str name="lowernames">true</str> + <str name="uprefix">ignored_</str> + + <!-- capture link hrefs but ignore div attributes --> + <str name="captureAttr">true</str> + <str name="fmap.a">links</str> + <str name="fmap.div">ignored_</str> + </lst> + </requestHandler> + + + <!-- Field Analysis Request Handler + + RequestHandler that provides much the same functionality as + analysis.jsp. Provides the ability to specify multiple field + types and field names in the same request and outputs + index-time and query-time analysis for each of them. + + Request parameters are: + analysis.fieldname - field name whose analyzers are to be used + + analysis.fieldtype - field type whose analyzers are to be used + analysis.fieldvalue - text for index-time analysis + q (or analysis.q) - text for query time analysis + analysis.showmatch (true|false) - When set to true and when + query analysis is performed, the produced tokens of the + field value analysis will be marked as "matched" for every + token that is produces by the query analysis + --> + <requestHandler name="/analysis/field" + startup="lazy" + class="solr.FieldAnalysisRequestHandler" /> + + + <!-- Document Analysis Handler + + http://wiki.apache.org/solr/AnalysisRequestHandler + + An analysis handler that provides a breakdown of the analysis + process of provided documents. This handler expects a (single) + content stream with the following format: + + <docs> + <doc> + <field name="id">1</field> + <field name="name">The Name</field> + <field name="text">The Text Value</field> + </doc> + <doc>...</doc> + <doc>...</doc> + ... + </docs> + + Note: Each document must contain a field which serves as the + unique key. This key is used in the returned response to associate + an analysis breakdown to the analyzed document. 
+ + Like the FieldAnalysisRequestHandler, this handler also supports + query analysis by sending either an "analysis.query" or "q" + request parameter that holds the query text to be analyzed. It + also supports the "analysis.showmatch" parameter which when set to + true, all field tokens that match the query tokens will be marked + as a "match". + --> + <requestHandler name="/analysis/document" + class="solr.DocumentAnalysisRequestHandler" + startup="lazy" /> + + <!-- Admin Handlers + + Admin Handlers - This will register all the standard admin + RequestHandlers. + --> + <requestHandler name="/admin/" + class="solr.admin.AdminHandlers" /> + <!-- This single handler is equivalent to the following... --> + <!-- + <requestHandler name="/admin/luke" class="solr.admin.LukeRequestHandler" /> + <requestHandler name="/admin/system" class="solr.admin.SystemInfoHandler" /> + <requestHandler name="/admin/plugins" class="solr.admin.PluginInfoHandler" /> + <requestHandler name="/admin/threads" class="solr.admin.ThreadDumpHandler" /> + <requestHandler name="/admin/properties" class="solr.admin.PropertiesRequestHandler" /> + <requestHandler name="/admin/file" class="solr.admin.ShowFileRequestHandler" > + --> + <!-- If you wish to hide files under ${solr.home}/conf, explicitly + register the ShowFileRequestHandler using: + --> + <!-- + <requestHandler name="/admin/file" + class="solr.admin.ShowFileRequestHandler" > + <lst name="invariants"> + <str name="hidden">synonyms.txt</str> + <str name="hidden">anotherfile.txt</str> + </lst> + </requestHandler> + --> + + <!-- ping/healthcheck --> + <requestHandler name="/admin/ping" class="solr.PingRequestHandler"> + <lst name="invariants"> + <str name="q">solrpingquery</str> + </lst> + <lst name="defaults"> + <str name="echoParams">all</str> + </lst> + <!-- An optional feature of the PingRequestHandler is to configure the + handler with a "healthcheckFile" which can be used to enable/disable + the PingRequestHandler. + relative paths are resolved against the data dir + --> + <!-- <str name="healthcheckFile">server-enabled.txt</str> --> + </requestHandler> + + <!-- Echo the request contents back to the client --> + <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" > + <lst name="defaults"> + <str name="echoParams">explicit</str> + <str name="echoHandler">true</str> + </lst> + </requestHandler> + + <!-- Solr Replication + + The SolrReplicationHandler supports replicating indexes from a + "master" used for indexing and "slaves" used for queries. + + http://wiki.apache.org/solr/SolrReplication + + In the example below, remove the <lst name="master"> section if + this is just a slave and remove the <lst name="slave"> section + if this is just a master. + --> + <!-- + <requestHandler name="/replication" class="solr.ReplicationHandler" > + <lst name="master"> + <str name="replicateAfter">commit</str> + <str name="replicateAfter">startup</str> + <str name="confFiles">schema.xml,stopwords.txt</str> + </lst> + <lst name="slave"> + <str name="masterUrl">http://localhost:8983/solr</str> + <str name="pollInterval">00:00:60</str> + </lst> + </requestHandler> + --> + + <!-- Solr Replication for SolrCloud Recovery + + This is the config need for SolrCloud's recovery replication. 
+ --> + <requestHandler name="/replication" class="solr.ReplicationHandler" startup="lazy" /> + + + <!-- Search Components + + Search components are registered to SolrCore and used by + instances of SearchHandler (which can access them by name) + + By default, the following components are available: + + <searchComponent name="query" class="solr.QueryComponent" /> + <searchComponent name="facet" class="solr.FacetComponent" /> + <searchComponent name="mlt" class="solr.MoreLikeThisComponent" /> + <searchComponent name="highlight" class="solr.HighlightComponent" /> + <searchComponent name="stats" class="solr.StatsComponent" /> + <searchComponent name="debug" class="solr.DebugComponent" /> + + Default configuration in a requestHandler would look like: + + <arr name="components"> + <str>query</str> + <str>facet</str> + <str>mlt</str> + <str>highlight</str> + <str>stats</str> + <str>debug</str> + </arr> + + If you register a searchComponent to one of the standard names, + that will be used instead of the default. + + To insert components before or after the 'standard' components, use: + + <arr name="first-components"> + <str>myFirstComponentName</str> + </arr> + + <arr name="last-components"> + <str>myLastComponentName</str> + </arr> + + NOTE: The component registered with the name "debug" will + always be executed after the "last-components" + + --> + + <!-- Spell Check + + The spell check component can return a list of alternative spelling + suggestions. + + http://wiki.apache.org/solr/SpellCheckComponent + --> + <searchComponent name="spellcheck" class="solr.SpellCheckComponent"> + + <str name="queryAnalyzerFieldType">textSpell</str> + + <!-- Multiple "Spell Checkers" can be declared and used by this + component + --> + + <!-- a spellchecker built from a field of the main index --> + <lst name="spellchecker"> + <str name="name">default</str> + <str name="field">name</str> + <str name="classname">solr.DirectSolrSpellChecker</str> + <!-- the spellcheck distance measure used, the default is the internal levenshtein --> + <str name="distanceMeasure">internal</str> + <!-- minimum accuracy needed to be considered a valid spellcheck suggestion --> + <float name="accuracy">0.5</float> + <!-- the maximum #edits we consider when enumerating terms: can be 1 or 2 --> + <int name="maxEdits">2</int> + <!-- the minimum shared prefix when enumerating terms --> + <int name="minPrefix">1</int> + <!-- maximum number of inspections per result. --> + <int name="maxInspections">5</int> + <!-- minimum length of a query term to be considered for correction --> + <int name="minQueryLength">4</int> + <!-- maximum threshold of documents a query term can appear to be considered for correction --> + <float name="maxQueryFrequency">0.01</float> + <!-- uncomment this to require suggestions to occur in 1% of the documents + <float name="thresholdTokenFrequency">.01</float> + --> + </lst> + + <!-- a spellchecker that can break or combine words. 
See "/spell" handler below for usage --> + <lst name="spellchecker"> + <str name="name">wordbreak</str> + <str name="classname">solr.WordBreakSolrSpellChecker</str> + <str name="field">name</str> + <str name="combineWords">true</str> + <str name="breakWords">true</str> + <int name="maxChanges">10</int> + </lst> + + <!-- a spellchecker that uses a different distance measure --> + <!-- + <lst name="spellchecker"> + <str name="name">jarowinkler</str> + <str name="field">spell</str> + <str name="classname">solr.DirectSolrSpellChecker</str> + <str name="distanceMeasure"> + org.apache.lucene.search.spell.JaroWinklerDistance + </str> + </lst> + --> + + <!-- a spellchecker that use an alternate comparator + + comparatorClass be one of: + 1. score (default) + 2. freq (Frequency first, then score) + 3. A fully qualified class name + --> + <!-- + <lst name="spellchecker"> + <str name="name">freq</str> + <str name="field">lowerfilt</str> + <str name="classname">solr.DirectSolrSpellChecker</str> + <str name="comparatorClass">freq</str> + --> + + <!-- A spellchecker that reads the list of words from a file --> + <!-- + <lst name="spellchecker"> + <str name="classname">solr.FileBasedSpellChecker</str> + <str name="name">file</str> + <str name="sourceLocation">spellings.txt</str> + <str name="characterEncoding">UTF-8</str> + <str name="spellcheckIndexDir">spellcheckerFile</str> + </lst> + --> + </searchComponent> + + <!-- A request handler for demonstrating the spellcheck component. + + NOTE: This is purely as an example. The whole purpose of the + SpellCheckComponent is to hook it into the request handler that + handles your normal user queries so that a separate request is + not needed to get suggestions. + + IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS + NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM! + + See http://wiki.apache.org/solr/SpellCheckComponent for details + on the request parameters. + --> + <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy"> + <lst name="defaults"> + <str name="df">text</str> + <!-- Solr will use suggestions from both the 'default' spellchecker + and from the 'wordbreak' spellchecker and combine them. + collations (re-written queries) can include a combination of + corrections from both spellcheckers --> + <str name="spellcheck.dictionary">default</str> + <str name="spellcheck.dictionary">wordbreak</str> + <str name="spellcheck">on</str> + <str name="spellcheck.extendedResults">true</str> + <str name="spellcheck.count">10</str> + <str name="spellcheck.alternativeTermCount">5</str> + <str name="spellcheck.maxResultsForSuggest">5</str> + <str name="spellcheck.collate">true</str> + <str name="spellcheck.collateExtendedResults">true</str> + <str name="spellcheck.maxCollationTries">10</str> + <str name="spellcheck.maxCollations">5</str> + </lst> + <arr name="last-components"> + <str>spellcheck</str> + </arr> + </requestHandler> + + <!-- Term Vector Component + + http://wiki.apache.org/solr/TermVectorComponent + --> + <searchComponent name="tvComponent" class="solr.TermVectorComponent"/> + + <!-- A request handler for demonstrating the term vector component + + This is purely as an example. + + In reality you will likely want to add the component to your + already specified request handlers. 
+ --> + <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy"> + <lst name="defaults"> + <str name="df">text</str> + <bool name="tv">true</bool> + </lst> + <arr name="last-components"> + <str>tvComponent</str> + </arr> + </requestHandler> + + <!-- Clustering Component + + http://wiki.apache.org/solr/ClusteringComponent + + You'll need to set the solr.cluster.enabled system property + when running solr to run with clustering enabled: + + java -Dsolr.clustering.enabled=true -jar start.jar + + --> + <searchComponent name="clustering" + enable="${solr.clustering.enabled:false}" + class="solr.clustering.ClusteringComponent" > + <!-- Declare an engine --> + <lst name="engine"> + <!-- The name, only one can be named "default" --> + <str name="name">default</str> + + <!-- Class name of Carrot2 clustering algorithm. + + Currently available algorithms are: + + * org.carrot2.clustering.lingo.LingoClusteringAlgorithm + * org.carrot2.clustering.stc.STCClusteringAlgorithm + * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm + + See http://project.carrot2.org/algorithms.html for the + algorithm's characteristics. + --> + <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> + + <!-- Overriding values for Carrot2 default algorithm attributes. + + For a description of all available attributes, see: + http://download.carrot2.org/stable/manual/#chapter.components. + Use attribute key as name attribute of str elements + below. These can be further overridden for individual + requests by specifying attribute key as request parameter + name and attribute value as parameter value. + --> + <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str> + + <!-- Location of Carrot2 lexical resources. + + A directory from which to load Carrot2-specific stop words + and stop labels. Absolute or relative to Solr config directory. + If a specific resource (e.g. stopwords.en) is present in the + specified dir, it will completely override the corresponding + default one that ships with Carrot2. + + For an overview of Carrot2 lexical resources, see: + http://download.carrot2.org/head/manual/#chapter.lexical-resources + --> + <str name="carrot.lexicalResourcesDir">clustering/carrot2</str> + + <!-- The language to assume for the documents. + + For a list of allowed values, see: + http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage + --> + <str name="MultilingualClustering.defaultLanguage">ENGLISH</str> + </lst> + <lst name="engine"> + <str name="name">stc</str> + <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str> + </lst> + </searchComponent> + + <!-- A request handler for demonstrating the clustering component + + This is purely as an example. + + In reality you will likely want to add the component to your + already specified request handlers. 
+ --> + <requestHandler name="/clustering" + startup="lazy" + enable="${solr.clustering.enabled:false}" + class="solr.SearchHandler"> + <lst name="defaults"> + <bool name="clustering">true</bool> + <str name="clustering.engine">default</str> + <bool name="clustering.results">true</bool> + <!-- The title field --> + <str name="carrot.title">name</str> + <str name="carrot.url">id</str> + <!-- The field to cluster on --> + <str name="carrot.snippet">features</str> + <!-- produce summaries --> + <bool name="carrot.produceSummary">true</bool> + <!-- the maximum number of labels per cluster --> + <!--<int name="carrot.numDescriptions">5</int>--> + <!-- produce sub clusters --> + <bool name="carrot.outputSubClusters">false</bool> + + <str name="defType">edismax</str> + <str name="qf"> + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + </str> + <str name="q.alt">*:*</str> + <str name="rows">10</str> + <str name="fl">*,score</str> + </lst> + <arr name="last-components"> + <str>clustering</str> + </arr> + </requestHandler> + + <!-- Terms Component + + http://wiki.apache.org/solr/TermsComponent + + A component to return terms and document frequency of those + terms + --> + <searchComponent name="terms" class="solr.TermsComponent"/> + + <!-- A request handler for demonstrating the terms component --> + <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy"> + <lst name="defaults"> + <bool name="terms">true</bool> + </lst> + <arr name="components"> + <str>terms</str> + </arr> + </requestHandler> + + + <!-- Query Elevation Component + + http://wiki.apache.org/solr/QueryElevationComponent + + a search component that enables you to configure the top + results for a given query regardless of the normal lucene + scoring. + --> +<!-- <searchComponent name="elevator" class="solr.QueryElevationComponent" >--> + <!-- pick a fieldType to analyze queries --> + <!--<str name="queryFieldType">string</str> + <str name="config-file">elevate.xml</str> + </searchComponent>--> + + <!-- A request handler for demonstrating the elevator component --> +<!-- <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + <str name="df">text</str> + </lst> + <arr name="last-components"> + <str>elevator</str> + </arr> + </requestHandler>--> + + <!-- Highlighting Component + + http://wiki.apache.org/solr/HighlightingParameters + --> + <searchComponent class="solr.HighlightComponent" name="highlight"> + <highlighting> + <!-- Configure the standard fragmenter --> + <!-- This could most likely be commented out in the "default" case --> + <fragmenter name="gap" + default="true" + class="solr.highlight.GapFragmenter"> + <lst name="defaults"> + <int name="hl.fragsize">100</int> + </lst> + </fragmenter> + + <!-- A regular-expression-based fragmenter + (for sentence extraction) + --> + <fragmenter name="regex" + class="solr.highlight.RegexFragmenter"> + <lst name="defaults"> + <!-- slightly smaller fragsizes work better because of slop --> + <int name="hl.fragsize">70</int> + <!-- allow 50% slop on fragment sizes --> + <float name="hl.regex.slop">0.5</float> + <!-- a basic sentence pattern --> + <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str> + </lst> + </fragmenter> + + <!-- Configure the standard formatter --> + <formatter name="html" + default="true" + class="solr.highlight.HtmlFormatter"> + <lst name="defaults"> + <str name="hl.simple.pre"><![CDATA[<em>]]></str> + <str name="hl.simple.post"><![CDATA[</em>]]></str> 
+ </lst> + </formatter> + + <!-- Configure the standard encoder --> + <encoder name="html" + class="solr.highlight.HtmlEncoder" /> + + <!-- Configure the standard fragListBuilder --> + <fragListBuilder name="simple" + class="solr.highlight.SimpleFragListBuilder"/> + + <!-- Configure the single fragListBuilder --> + <fragListBuilder name="single" + class="solr.highlight.SingleFragListBuilder"/> + + <!-- Configure the weighted fragListBuilder --> + <fragListBuilder name="weighted" + default="true" + class="solr.highlight.WeightedFragListBuilder"/> + + <!-- default tag FragmentsBuilder --> + <fragmentsBuilder name="default" + default="true" + class="solr.highlight.ScoreOrderFragmentsBuilder"> + <!-- + <lst name="defaults"> + <str name="hl.multiValuedSeparatorChar">/</str> + </lst> + --> + </fragmentsBuilder> + + <!-- multi-colored tag FragmentsBuilder --> + <fragmentsBuilder name="colored" + class="solr.highlight.ScoreOrderFragmentsBuilder"> + <lst name="defaults"> + <str name="hl.tag.pre"><![CDATA[ + <b style="background:yellow">,<b style="background:lawgreen">, + <b style="background:aquamarine">,<b style="background:magenta">, + <b style="background:palegreen">,<b style="background:coral">, + <b style="background:wheat">,<b style="background:khaki">, + <b style="background:lime">,<b style="background:deepskyblue">]]></str> + <str name="hl.tag.post"><![CDATA[</b>]]></str> + </lst> + </fragmentsBuilder> + + <boundaryScanner name="default" + default="true" + class="solr.highlight.SimpleBoundaryScanner"> + <lst name="defaults"> + <str name="hl.bs.maxScan">10</str> + <str name="hl.bs.chars">.,!? </str> + </lst> + </boundaryScanner> + + <boundaryScanner name="breakIterator" + class="solr.highlight.BreakIteratorBoundaryScanner"> + <lst name="defaults"> + <!-- type should be one of CHARACTER, WORD(default), LINE and SENTENCE --> + <str name="hl.bs.type">WORD</str> + <!-- language and country are used when constructing Locale object. --> + <!-- And the Locale object will be used when getting instance of BreakIterator --> + <str name="hl.bs.language">en</str> + <str name="hl.bs.country">US</str> + </lst> + </boundaryScanner> + </highlighting> + </searchComponent> + + <!-- Update Processors + + Chains of Update Processor Factories for dealing with Update + Requests can be declared, and then used by name in Update + Request Processors + + http://wiki.apache.org/solr/UpdateRequestProcessor + + --> + <!-- Deduplication + + An example dedup update processor that creates the "id" field + on the fly based on the hash code of some other fields. This + example has overwriteDupes set to false since we are using the + id field as the signatureField and Solr will maintain + uniqueness based on that anyway. + + --> + <!-- + <updateRequestProcessorChain name="dedupe"> + <processor class="solr.processor.SignatureUpdateProcessorFactory"> + <bool name="enabled">true</bool> + <str name="signatureField">id</str> + <bool name="overwriteDupes">false</bool> + <str name="fields">name,features,cat</str> + <str name="signatureClass">solr.processor.Lookup3Signature</str> + </processor> + <processor class="solr.LogUpdateProcessorFactory" /> + <processor class="solr.RunUpdateProcessorFactory" /> + </updateRequestProcessorChain> + --> + + <!-- Language identification + + This example update chain identifies the language of the incoming + documents using the langid contrib. The detected language is + written to field language_s. No field name mapping is done. 
+ The fields used for detection are text, title, subject and description, + making this example suitable for detecting languages form full-text + rich documents injected via ExtractingRequestHandler. + See more about langId at http://wiki.apache.org/solr/LanguageDetection + --> + <!-- + <updateRequestProcessorChain name="langid"> + <processor class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory"> + <str name="langid.fl">text,title,subject,description</str> + <str name="langid.langField">language_s</str> + <str name="langid.fallback">en</str> + </processor> + <processor class="solr.LogUpdateProcessorFactory" /> + <processor class="solr.RunUpdateProcessorFactory" /> + </updateRequestProcessorChain> + --> + + <!-- Script update processor + + This example hooks in an update processor implemented using JavaScript. + + See more about the script update processor at http://wiki.apache.org/solr/ScriptUpdateProcessor + --> + <!-- + <updateRequestProcessorChain name="script"> + <processor class="solr.StatelessScriptUpdateProcessorFactory"> + <str name="script">update-script.js</str> + <lst name="params"> + <str name="config_param">example config parameter</str> + </lst> + </processor> + <processor class="solr.RunUpdateProcessorFactory" /> + </updateRequestProcessorChain> + --> + + <!-- Response Writers + + http://wiki.apache.org/solr/QueryResponseWriter + + Request responses will be written using the writer specified by + the 'wt' request parameter matching the name of a registered + writer. + + The "default" writer is the default and will be used if 'wt' is + not specified in the request. + --> + <!-- The following response writers are implicitly configured unless + overridden... + --> + <!-- + <queryResponseWriter name="xml" + default="true" + class="solr.XMLResponseWriter" /> + <queryResponseWriter name="json" class="solr.JSONResponseWriter"/> + <queryResponseWriter name="python" class="solr.PythonResponseWriter"/> + <queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/> + <queryResponseWriter name="php" class="solr.PHPResponseWriter"/> + <queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/> + <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/> + --> + + <queryResponseWriter name="json" class="solr.JSONResponseWriter"> + <!-- For the purposes of the tutorial, JSON responses are written as + plain text so that they are easy to read in *any* browser. + If you expect a MIME type of "application/json" just remove this override. + --> + <str name="content-type">text/plain; charset=UTF-8</str> + </queryResponseWriter> + + <!-- + Custom response writers can be declared as needed... + --> + <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/> + + + <!-- XSLT response writer transforms the XML output by any xslt file found + in Solr's conf/xslt directory. Changes to xslt files are checked for + every xsltCacheLifetimeSeconds. 
+ --> + <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter"> + <int name="xsltCacheLifetimeSeconds">5</int> + </queryResponseWriter> + + <!-- Query Parsers + + http://wiki.apache.org/solr/SolrQuerySyntax + + Multiple QParserPlugins can be registered by name, and then + used in either the "defType" param for the QueryComponent (used + by SearchHandler) or in LocalParams + --> + <!-- example of registering a query parser --> + <!-- + <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/> + --> + + <!-- Function Parsers + + http://wiki.apache.org/solr/FunctionQuery + + Multiple ValueSourceParsers can be registered by name, and then + used as function names when using the "func" QParser. + --> + <!-- example of registering a custom function parser --> + <!-- + <valueSourceParser name="myfunc" + class="com.mycompany.MyValueSourceParser" /> + --> + + + <!-- Document Transformers + http://wiki.apache.org/solr/DocTransformers + --> + <!-- + Could be something like: + <transformer name="db" class="com.mycompany.LoadFromDatabaseTransformer" > + <int name="connection">jdbc://....</int> + </transformer> + + To add a constant value to all docs, use: + <transformer name="mytrans2" class="org.apache.solr.response.transform.ValueAugmenterFactory" > + <int name="value">5</int> + </transformer> + + If you want the user to still be able to change it with _value:something_ use this: + <transformer name="mytrans3" class="org.apache.solr.response.transform.ValueAugmenterFactory" > + <double name="defaultValue">5</double> + </transformer> + + If you are using the QueryElevationComponent, you may wish to mark documents that get boosted. The + EditorialMarkerFactory will do exactly that: + <transformer name="qecBooster" class="org.apache.solr.response.transform.EditorialMarkerFactory" /> + --> + + + <!-- Legacy config for the admin interface --> + <admin> + <defaultQuery>*:*</defaultQuery> + </admin> + +</config> diff --git a/requirements.txt b/requirements.txt index 9cb79727f..c7e33c2f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,4 +45,6 @@ pyenchant # OAI-PMH pyoai +egenix-mx-base +sunburnt diff --git a/wolnelektury/settings/basic.py b/wolnelektury/settings/basic.py index 3dcb48450..e5ba5cd16 100644 --- a/wolnelektury/settings/basic.py +++ b/wolnelektury/settings/basic.py @@ -21,6 +21,8 @@ DATABASES = { } } +SOLR = "http://localhost:8983/solr/wl/" + # Local time zone for this installation. Choices can be found here: # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
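
The settings change above points the project at a single Solr core (SOLR = "http://localhost:8983/solr/wl/") and requirements.txt now pulls in sunburnt. A minimal sketch of talking to that core through sunburnt is below; it assumes the core is running with its schema reachable over HTTP, and the field names are placeholders rather than the project's actual schema.

# -*- coding: utf-8 -*-
# Sketch: index one document and query it back through sunburnt.
# Field names below are hypothetical placeholders.
import sunburnt
from django.conf import settings

# Connects to the core and reads its schema on construction.
si = sunburnt.SolrInterface(settings.SOLR)

si.add({"id": "book-1", "title": "Pan Tadeusz"})
si.commit()

# Iterate the query response; each hit behaves like a dict of stored fields.
for hit in si.query(title="Tadeusz").execute():
    print hit["id"]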
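
The /update handler comment above notes that, since Solr 1.1, a posted body needs an explicit content type (the curl example uses Content-type: text/xml; charset=utf-8). The same call from Python, using the XML update message format from the linked wiki page, might look like the sketch below; the core URL and field names are placeholders.

# Sketch: post an <add> command to the /update handler, then commit it.
import urllib2

update_url = "http://localhost:8983/solr/wl/update"
headers = {"Content-type": "text/xml; charset=utf-8"}

add_cmd = u"""<add>
  <doc>
    <field name="id">book-1</field>
    <field name="title">Pan Tadeusz</field>
  </doc>
</add>""".encode("utf-8")

urllib2.urlopen(urllib2.Request(update_url, add_cmd, headers)).read()
# A separate <commit/> makes the document visible to searchers.
urllib2.urlopen(urllib2.Request(update_url, "<commit/>", headers)).read()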
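
The /spell handler registered above combines the 'default' and 'wordbreak' dictionaries and returns collations; note also that this config overrides the JSON writer's content type to text/plain, so the body is still JSON even though the MIME type says otherwise. A hedged sketch of asking it for suggestions follows; the exact response layout differs between Solr versions, so the key paths here are an assumption to verify against a real response.

# Sketch: ask the /spell handler for suggestions and collations.
import json
import urllib
import urllib2

params = urllib.urlencode({"q": "przgoda", "wt": "json"})
raw = urllib2.urlopen("http://localhost:8983/solr/wl/spell?" + params).read()

# The JSON writer is configured with content-type text/plain,
# but the body is still JSON and parses normally.
data = json.loads(raw)
print data.get("spellcheck", {}).get("suggestions", [])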
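
Because the /admin/ping handler is registered with an invariant query, a plain GET is enough for a deploy-script or load-balancer health check. A small sketch, with the core URL again a placeholder:

# Sketch: use /admin/ping as a cheap health check.
import urllib2

def solr_is_up(core_url="http://localhost:8983/solr/wl/"):
    try:
        urllib2.urlopen(core_url + "admin/ping?wt=json", timeout=2).read()
        return True
    except Exception:
        return False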