lucene-java-3.4.0/lucene/contrib/xml-query-parser/LuceneContribQuery.dtd

   1 <!--
   2         This DTD builds on the <a href="LuceneCoreQuery.dtd.html">core Lucene XML syntax</a> and adds support for features found in the "contrib" section of the Lucene project.
   3
   4         CorePlusExtensionsParser.java is the Java class that encapsulates this parser behaviour.
   5
   6
   7         The features added are:
   8         <ul>
   9         <li><a href="#LikeThisQuery">LikeThisQuery</a><br/>
  10            Support for querying using large amounts of example text indicative of the users' general area of interest</li>
  11         <li><a href="#FuzzyLikeThisQuery">FuzzyLikeThisQuery</a><br/>
  12            A style of fuzzy query which automatically looks for fuzzy variations on only the "interesting" terms</li>
  13         <li><a href="#BooleanFilter">BooleanFilter</a><br/>
  14            Is to Filters what core Lucene's BooleanQuery is to Queries - allows mixing of clauses using Boolean logic</li>
  15         <li><a href="#TermsFilter">TermsFilter</a><br/>
  16            Constructs a filter from an arbitrary set of terms (unlike <a href="#RangeFilter">RangeFilter</a> which requires a contiguous range of terms)</li>
  17         <li><a href="#DuplicateFilter">DuplicateFilter</a><br/>
  18            Removes duplicated documents from results where "duplicate" means documents share a value for a particular field (e.g. a primary key)</li>
  19         <li><a href="#BoostingQuery">BoostingQuery</a><br/>
  20            Influences the score of a query's matches in a subtle way which can't be achieved using BooleanQuery</li>
  21         </ul>
  22         @title Contrib Lucene
  23 -->
  24 <!-- @hidden Declares the core DTD as an external parameter entity; it is only included where coreParserDTD is referenced further down -->
  25 <!ENTITY % coreParserDTD SYSTEM "LuceneCoreQuery.dtd" >
  26
  27
  28 <!-- @hidden Allow for extensions: these placeholders expand to a single space here and are appended to the extendedQueries1 and extendedFilters1 lists below -->
  29 <!ENTITY % extendedSpanQueries2 " " >
  30 <!ENTITY % extendedQueries2 " " >
  31 <!ENTITY % extendedFilters2 " " >
  32
  33 <!-- @hidden Names of the queries and filters declared in this DTD; each list starts with "|" and ends with the matching placeholder(s) from above -->
  34 <!ENTITY % extendedQueries1 "|LikeThisQuery|BoostingQuery|FuzzyLikeThisQuery%extendedQueries2;%extendedSpanQueries2;" >
  35 <!ENTITY % extendedFilters1 "|TermsFilter|BooleanFilter|DuplicateFilter%extendedFilters2;" >
  36
  37
  38 %coreParserDTD;
  39
  40 <!--
  41 Performs fuzzy matching on "significant" terms in fields. Improves on "LikeThisQuery" by allowing for fuzzy variations of supplied fields.
  42 Improves on FuzzyQuery by rewarding all fuzzy variants of a term with the same IDF rather than the default fuzzy behaviour, which ranks rarer
  43         variants (typically misspellings) more highly. This can be a useful default search mode for processing user input where the end user
  44         is not expected to know about the standard query operators for fuzzy, boolean or phrase logic found in UserQuery.
  45         @example
  46                 <em>Search for information about the Sumitomo bank, where the end user has misspelt the name</em>
  47                 %
  48             <FuzzyLikeThisQuery>
  49                 <Field fieldName="contents">
  50                     Sumitimo bank
  51                 </Field>
  52             </FuzzyLikeThisQuery>
  53                 %
  54 -->
  55 <!ELEMENT FuzzyLikeThisQuery (Field)*>
  56 <!-- Optional boost for matches on this query. Values > 1 increase the relative importance of matches; defaults to 1.0 -->
  57 <!ATTLIST FuzzyLikeThisQuery boost CDATA "1.0">
  58 <!-- Limits the total number of terms selected from the provided text plus the selected "fuzzy" variants -->
  59 <!ATTLIST FuzzyLikeThisQuery maxNumTerms CDATA "50">
  60 <!-- Ignore "Term Frequency" - a boost factor which rewards multiple occurrences of the same term in a document -->
  61 <!ATTLIST FuzzyLikeThisQuery ignoreTF (true|false) "false">
  62 <!-- A field used in a FuzzyLikeThisQuery; the element's text content is the text to be matched -->
  63 <!ELEMENT Field (#PCDATA)>
  64 <!-- Controls the level of similarity required for fuzzy variants, where 1 means identical and 0.5 means the variant contains
  65         half of the original's characters in the same order. Lower values produce more results but may take longer to execute due to
  66         the additional IO required to read matching document ids -->
  67 <!ATTLIST Field minSimilarity CDATA "0.5">
  68 <!-- Controls the minimum number of characters at the start of fuzzy variant words that must exactly match the original.
  69         A value of zero requires no minimum, so the search software will effectively scan ALL terms from a to z looking for variations.
  70         This can incur high CPU overhead; a prefix length of just "1" reduces this overhead to 1/26th of the original cost (assuming
  71         an even distribution of letters used from the alphabet).
  72  -->
  73 <!ATTLIST Field prefixLength CDATA "1">
  74 <!-- fieldName must be defined here or is taken from the most immediate parent XML element that defines a "fieldName" attribute -->
  75 <!ATTLIST Field fieldName CDATA #IMPLIED>
  76
  77
  78
  79 <!--
  80         Cherry-picks "significant" terms from the example child text and queries using these words. By only using significant (read: rare) terms, the
  81         performance cost of the query is substantially reduced and large bodies of text can be used as example content.
  82         @example
  83                 <em>Use a block of text as an example of the type of content to be found, ignoring the "Reuters" word which
  84                appears commonly in the index.</em>
  85                 %
  86             <LikeThisQuery percentTermsToMatch="5" stopWords="Reuters">
  87                 IRAQI TROOPS REPORTED PUSHING BACK IRANIANS Iraq said today its troops were pushing Iranian forces out of
  88                 positions they had initially occupied when they launched a new offensive near the southern port of
  89                 Basra early yesterday.     A High Command communique said Iraqi troops had won a significant victory
  90                 and were continuing to advance.     Iraq said it had foiled a three-pronged thrust some 10 km
  91                 (six miles) from Basra, but admitted the Iranians had occupied ground held by the Mohammed al-Qassem
  92                 unit, one of three divisions attacked.     The communique said Iranian Revolutionary Guards were under
  93                 assault from warplanes, helicopter gunships, heavy artillery and tanks.     "Our forces are continuing
  94                 their advance until they purge the last foothold" occupied by the Iranians, it said.
  95                 (Iran said its troops had killed or wounded more than 4,000 Iraqis and were stabilising their new positions.)
  96                 The Baghdad communique said Iraqi planes also destroyed oil installations at Iran's southwestern Ahvaz field
  97                 during a raid today. It denied an Iranian report that an Iraqi jet was shot down.
  98                 Iraq also reported a naval battle at the northern tip of the Gulf. Iraqi naval units and forces defending an
  99                 offshore terminal sank six Iranian out of 28 Iranian boats attempting to attack an offshore terminal,
 100                 the communique said.      Reuters 3;
 101             </LikeThisQuery>
 102                 %
 103         -->
 104 <!ELEMENT LikeThisQuery (#PCDATA)>
 105 <!-- Optional boost for matches on this query. Values > 1 increase the relative importance of matches; defaults to 1.0 -->
 106 <!ATTLIST LikeThisQuery boost CDATA "1.0">
 107 <!-- Comma delimited list of field names -->
 108 <!ATTLIST LikeThisQuery fieldNames CDATA #IMPLIED>
 109 <!-- A list of stop words - analyzed to produce stop terms -->
 110 <!ATTLIST LikeThisQuery stopWords CDATA #IMPLIED>
 111 <!-- Controls the maximum number of words shortlisted for the query. The higher the number, the slower the response due to the additional disk reads required -->
 112 <!ATTLIST LikeThisQuery maxQueryTerms CDATA "20">
 113 <!-- Controls how many times a term must appear in the example text before it is shortlisted for use in the query -->
 114 <!ATTLIST LikeThisQuery minTermFrequency CDATA "1">
 115 <!-- A quality control that can be used to limit the number of results to those documents matching a certain percentage of the shortlisted query terms.
 116         Values must be between 1 and 100 -->
 117 <!ATTLIST LikeThisQuery percentTermsToMatch CDATA "30">
 118
 119 <!--
 120         Requires matches on the "Query" element and optionally boosts by any matches on the "BoostQuery".
 121         Unlike a regular BooleanQuery, the boost can be less than 1 to produce a subtractive rather than additive result
 122         on the match score.
 123         @example <em>Find documents about banks, preferably related to mergers, and preferably not about "World bank"</em>
 124         %
 125         <BoostingQuery>
 126           <Query>
 127              <BooleanQuery fieldName="contents">
 128                <Clause occurs="should">
 129                   <TermQuery>merger</TermQuery>
 130                </Clause>
 131                <Clause occurs="must">
 132                   <TermQuery>bank</TermQuery>
 133                </Clause>
 134              </BooleanQuery>
 135           </Query>
 136           <BoostQuery boost="0.01">
 137              <UserQuery>"world bank"</UserQuery>
 138           </BoostQuery>
 139         </BoostingQuery>
 140         %
 141
 142 -->
 143 <!ELEMENT BoostingQuery (Query,BoostQuery)>
 144 <!-- Optional boost for matches on this query. Values > 1 increase the relative importance of matches; defaults to 1.0 -->
 145 <!ATTLIST BoostingQuery boost CDATA "1.0">
 146
 147 <!--
 148         Child element of BoostingQuery that contains the choice of query whose matches are used for boosting purposes
 149 -->
 150 <!ELEMENT BoostQuery (%queries;)>
 151 <!-- Optional boost for matches on this query; defaults to 1.0. A boost of >0 but <1
 152         effectively demotes results from Query that match this BoostQuery.
 153         -->
 154 <!ATTLIST BoostQuery boost CDATA "1.0">
 155
 156
 157
 158 <!-- Removes duplicated documents from results, where "duplicate" means documents share a value for a particular field such as a primary key.
 159         @example <em>Find the latest version of each web page that mentions "Lucene"</em>
 160         %
 161         <FilteredQuery>
 162           <Query>
 163              <TermQuery fieldName="text">lucene</TermQuery>
 164           </Query>
 165           <Filter>
 166              <DuplicateFilter fieldName="url" keepMode="last"/>
 167           </Filter>
 168         </FilteredQuery>
 169         %
 170         -->
 171 <!ELEMENT DuplicateFilter EMPTY>
 172 <!-- fieldName must be defined here or is taken from the most immediate parent XML element that defines a "fieldName" attribute -->
 173 <!ATTLIST DuplicateFilter fieldName CDATA #IMPLIED>
 174 <!-- Determines if the first or last document occurrence is the one to return when presented with duplicated field values -->
 175 <!ATTLIST DuplicateFilter keepMode (first | last) "first">
 176 <!-- Controls the choice of process used to produce the filter - "full" mode identifies only non-duplicate documents with the chosen field,
 177         while "fast" mode may perform faster but will also mark documents <em>without</em> the field as valid. The former approach starts by
 178         assuming every document is a duplicate and then finds the "master" documents to keep, while the latter approach assumes all documents are
 179         unique and unmarks those documents that are a copy.
 180         -->
 181 <!ATTLIST DuplicateFilter processingMode (full | fast) "full">
 182
 183
 184
 185
 186 <!-- Processes child text using a field-specific choice of Analyzer to produce a set of terms that are then used as a filter. The terms can be arbitrary; unlike RangeFilter they need not form a contiguous range.
 187         @example <em>Find documents talking about Lucene written on a Monday or a Friday</em>
 188         %
 189         <FilteredQuery>
 190           <Query>
 191              <TermQuery fieldName="text">lucene</TermQuery>
 192           </Query>
 193           <Filter>
 194              <TermsFilter fieldName="dayOfWeek">monday friday</TermsFilter>
 195           </Filter>
 196         </FilteredQuery>
 197         %
 198
 199         -->
 200 <!ELEMENT TermsFilter (#PCDATA)>
 201 <!-- fieldName must be defined here or is taken from the most immediate parent XML element that defines a "fieldName" attribute -->
 202 <!ATTLIST TermsFilter fieldName CDATA #IMPLIED>
 203 <!--
 204         A Filter equivalent to BooleanQuery that applies Boolean logic to Clauses containing Filters.
 205         Unlike BooleanQuery, a BooleanFilter can contain just a single "mustNot" clause.
 206         @example <em>Find documents from the first quarter of this year or last year that are not in "draft" status</em>
 207         %
 208         <FilteredQuery>
 209           <Query>
 210              <MatchAllDocsQuery/>
 211           </Query>
 212           <Filter>
 213              <BooleanFilter>
 214                <Clause occurs="should">
 215                   <RangeFilter fieldName="date" lowerTerm="20070101" upperTerm="20070401"/>
 216                </Clause>
 217                <Clause occurs="should">
 218                   <RangeFilter fieldName="date" lowerTerm="20060101" upperTerm="20060401"/>
 219                </Clause>
 220                <Clause occurs="mustNot">
 221                   <TermsFilter fieldName="status">draft</TermsFilter>
 222                </Clause>
 223              </BooleanFilter>
 224           </Filter>
 225         </FilteredQuery>
 226         %
 227         -->
 228 <!ELEMENT BooleanFilter (Clause)+>
 229