X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/6672db0a403048d321eb97134c6a53866703dd4a..f2cd20cec6083c7bc8fb17706b1718faa09a6139:/doc/schema.xml?ds=sidebyside diff --git a/doc/schema.xml b/doc/schema.xml index ed51a8b99..d3cbbe835 100644 --- a/doc/schema.xml +++ b/doc/schema.xml @@ -90,50 +90,16 @@ and back compatibility is not guaranteed. Names with both leading and trailing underscores (e.g. _version_) are reserved. --> - - <!-- <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> --> - <!-- <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/> --> - <!-- <field name="name" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/> --> - <!-- <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/> --> - <!-- <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/> --> - <!-- <field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> --> - - <!-- <field name="weight" type="float" indexed="true" stored="true"/> --> - <!-- <field name="price" type="float" indexed="true" stored="true"/> --> - <!-- <field name="popularity" type="int" indexed="true" stored="true" /> --> - <!-- <field name="inStock" type="boolean" indexed="true" stored="true" /> --> - - <!-- <field name="store" type="location" indexed="true" stored="true"/> --> - - <!-- Common metadata fields, named specifically to match up with - SolrCell metadata when parsing rich documents such as Word, PDF. - Some fields are multiValued only because Tika currently may return - multiple values for them. 
Some metadata is parsed from the documents, - but there are some which come from the client context: - "content_type": From the HTTP headers of incoming stream - "resourcename": From SolrCell request param resource.name - --> - <!-- <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/> --> - <!-- <field name="subject" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="description" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="comments" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="author" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="keywords" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="category" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="resourcename" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="url" type="text_general" indexed="true" stored="true"/> --> - <!-- <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> --> - <!-- <field name="last_modified" type="date" indexed="true" stored="true"/> --> - <!-- <field name="links" type="string" indexed="true" stored="true" multiValued="true"/> --> <field name="book_id" type="int" indexed="true" stored="true" /> <field name="parent_id" type="int" indexed="false" stored="true" /> <field name="slug" type="lowercase" stored="false" indexed="true" omitNorms="true"/> <!-- no norms --> <field name="is_book" type="boolean" stored="false" indexed="true"/> <field name="authors" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true"/> + <field name="authors_nonstem" type="text_ascii" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true"/> <field name="translators" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true" termPositions="true" 
termVectors="true" /> <field name="title" type="text_pl_nonstop" stored="false" indexed="true"/> + <field name="title_nonstem" type="text_ascii" stored="false" indexed="true"/> <!-- <field name="published_date" type="tdate" stored="false" indexed="true"/>--> <field name="published_date" type="string" stored="true" indexed="true"/> @@ -142,13 +108,16 @@ <field name="genres" type="lowercase" stored="false" indexed="false" multiValued="true" /> <field name="metadata" type="text_pl_nonstop" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" /> + <field name="metadata_nonstem" type="text_ascii" stored="false" indexed="true" multiValued="true" termPositions="true" termVectors="true" /> <!-- text_ascii, like the other *_nonstem companions of text_pl_nonstop fields: text_pl_nonstop itself runs the Morfologik stemmer --> <field name="themes" type="lowercase" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" /> <field name="themes_pl" type="text_pl_nonstop" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" /> + <field name="themes_pl_nonstem" type="text_ascii" stored="true" indexed="true" termVectors="true" termPositions="true" multiValued="true" /> <field name="header_index" type="int" stored="true" indexed="true"/> <field name="header_span" type="int" stored="true" indexed="true"/> <field name="header_type" type="lowercase" stored="true" indexed="false"/> <field name="text" type="text_pl" stored="false" indexed="true" termVectors="true" termPositions="true" /> + <field name="text_nonstem" type="text_pl_nonstem" stored="false" indexed="true" termVectors="true" termPositions="true" /> <field name="snippets_position" type="int" stored="true" indexed="false"/> <field name="snippets_length" type="int" stored="true" indexed="false"/> @@ -164,27 +133,6 @@ <field name="tag_category" type="string" stored="true" indexed="true" /> <field name="is_pdcounter" type="boolean" stored="true" indexed="true" /> - <!-- Main body of document extracted by SolrCell. 
- NOTE: This field is not indexed by default, since it is also copied to "text" - using copyField below. This is to save space. Use this field for returning and - highlighting document content. Use the "text" field to search the content. --> - <!-- <field name="content" type="text_general" indexed="false" stored="true" multiValued="true"/> --> - - - <!-- catchall field, containing all other searchable text fields (implemented - via copyField further on in this schema --> - <!-- <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/> --> - - <!-- catchall text field that indexes tokens both normally and in reverse for efficient - leading wildcard queries. --> - <!-- <field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/> --> - - <!-- non-tokenized version of manufacturer to make it easier to sort or group - results by manufacturer. copied from "manu" via copyField --> - <!-- <field name="manu_exact" type="string" indexed="true" stored="false"/> --> - - <!-- <field name="payloads" type="payloads" indexed="true" stored="true"/> --> - <field name="_version_" type="long" indexed="true" stored="true"/> <!-- Uncommenting the following will create a "timestamp" field using @@ -208,21 +156,6 @@ --> <uniqueKey>uid</uniqueKey> - <!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when - parsing a query string that isn't explicit about the field. Machine (non-user) - generated queries are best made explicit, or they can use the "df" request parameter - which takes precedence over this. - Note: Un-commenting defaultSearchField will be insufficient if your request handler - in solrconfig.xml defines "df", which takes precedence. That would need to be removed. 
- <defaultSearchField>text</defaultSearchField> --> - - <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers - when parsing a query string to determine if a clause of the query should be marked as - required or optional, assuming the clause isn't already marked by some operator. - The default is OR, which is generally assumed so it is not a good idea to change it - globally here. The "q.op" request parameter takes precedence over this. - <solrQueryParser defaultOperator="OR"/> --> - <!-- copyField commands copy one field to another at the time a document is added to the index. It's used either to index the same field differently, or to add multiple fields to the same field for easier/faster searching. --> @@ -235,40 +168,16 @@ <copyField source="kinds" dest="metadata"/> <copyField source="genres" dest="metadata"/> -<!-- - <copyField source="cat" dest="text"/> - <copyField source="name" dest="text"/> - <copyField source="manu" dest="text"/> - <copyField source="features" dest="text"/> - <copyField source="includes" dest="text"/> - <copyField source="manu" dest="manu_exact"/> ---> - <!-- Copy the price into a currency enabled field (default USD) --> -<!-- <copyField source="price" dest="price_c"/>--> - - <!-- Text fields from SolrCell to search by default in our catch-all field --> -<!-- <copyField source="title" dest="text"/> - <copyField source="author" dest="text"/> - <copyField source="description" dest="text"/> - <copyField source="keywords" dest="text"/> - <copyField source="content" dest="text"/> - <copyField source="content_type" dest="text"/> - <copyField source="resourcename" dest="text"/> - <copyField source="url" dest="text"/>--> - - <!-- Create a string version of author for faceting --> -<!-- <copyField source="author" dest="author_s"/>--> - - <!-- Above, multiple source fields are copied to the [text] field. - Another way to map multiple source fields to the same - destination field is to use the dynamic field syntax. 
- copyField also supports a maxChars to copy setting. --> - - <!-- <copyField source="*_t" dest="text" maxChars="3000"/> --> - - <!-- copy name to alphaNameSort, a field designed for sorting by name --> - <!-- <copyField source="name" dest="alphaNameSort"/> --> - + <copyField source="translators" dest="metadata_nonstem"/> <!-- metadata_nonstem collects translators, epochs, kinds and genres --> + <copyField source="epochs" dest="metadata_nonstem"/> + <copyField source="kinds" dest="metadata_nonstem"/> + <copyField source="genres" dest="metadata_nonstem"/> + + <copyField source="authors" dest="authors_nonstem"/> <!-- each *_nonstem field below is fed from its base field; note themes_pl_nonstem is fed from themes, not themes_pl --> + <copyField source="title" dest="title_nonstem"/> + <copyField source="themes" dest="themes_pl_nonstem"/> + <copyField source="text" dest="text_nonstem"/> + <types> <!-- field type definitions. The "name" attribute is just a label to be used by field definitions. The "class" @@ -287,8 +196,8 @@ <!-- sortMissingLast and sortMissingFirst attributes are optional attributes are currently supported on types that are sorted internally as strings and on numeric types. - This includes "string","boolean", and, as of 3.5 (and 4.x), - int, float, long, date, double, including the "Trie" variants. + This includes "string","boolean", and, as of 3.5 (and 4.x), + int, float, long, date, double, including the "Trie" variants. - If sortMissingLast="true", then a sort on this field will cause documents without the field to come after documents with the field, regardless of the requested sort order (asc or desc). @@ -367,26 +276,6 @@ --> <fieldType name="random" class="solr.RandomSortField" indexed="true" /> - <!-- solr.TextField allows the specification of custom text analyzers - specified as a tokenizer and a list of token filters. Different - analyzers may be specified for indexing and querying. - - The optional positionIncrementGap puts space between multiple fields of - this type on the same document, with the purpose of preventing false phrase - matching across fields. 
- - For more info on customizing your analyzer chain, please see - http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters - --> - - <!-- One can also specify an existing Analyzer class that has a - default constructor via the class attribute on the analyzer element. - Example: - <fieldType name="text_greek" class="solr.TextField"> - <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> - </fieldType> - --> - <fieldType name="uuid" class="solr.UUIDField" indexed="true" /> @@ -421,7 +310,14 @@ <!-- Polish --> <fieldType name="text_pl" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> - <analyzer> + <analyzer type="index"> <!-- index side: lowercase, drop stop words, Morfologik stemming, then ASCII folding that also keeps the unfolded token --> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball"/> + <filter class="solr.MorfologikFilterFactory" dictionary="morfologik/stemming/polish/polish.dict" /> + <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" /> + </analyzer> + <analyzer type="query"> <!-- query side: reuses the pre-existing chain; NOTE(review): its tail lies outside this hunk, presumably without ASCII folding like the sibling types; confirm --> <tokenizer class="solr.StandardTokenizerFactory"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball"/> @@ -429,14 +325,46 @@ </analyzer> </fieldType> + <fieldType name="text_pl_nonstem" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> <!-- like text_pl on the index side but with no Morfologik filter: stop words are still removed, words are not stemmed --> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pl.txt" format="snowball"/> + <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" /> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="lang/stopwords_pl.txt" format="snowball"/> + </analyzer> + </fieldType> + <fieldType name="text_pl_nonstop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> - <analyzer> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.MorfologikFilterFactory" dictionary="morfologik/stemming/polish/polish.dict" /> + <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" /> + </analyzer> + <analyzer type="query"> <tokenizer class="solr.StandardTokenizerFactory"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.MorfologikFilterFactory" dictionary="morfologik/stemming/polish/polish.dict" /> </analyzer> </fieldType> + <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> <!-- lowercased tokens with no stop-word and no Morfologik filter; the index side also emits ASCII-folded forms while preserveOriginal keeps the accented token --> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" /> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + </types> <!-- Similarity is the scoring routine for each document vs. a query.