lucene-java-3.5.0/lucene/src/java/org/apache/lucene/document/Field.java

   1 package org.apache.lucene.document;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.TokenStream;
  21 import org.apache.lucene.index.IndexWriter;   // for javadoc
  22 import org.apache.lucene.util.StringHelper;
  23
  24 import java.io.Reader;
  25 import java.io.Serializable;
  26
  27 import org.apache.lucene.index.FieldInfo.IndexOptions;
  28
  29 /**
  30   A field is a section of a Document.  Each field has two parts, a name and a
  31   value.  Values may be free text, provided as a String or as a Reader, or they
  32   may be atomic keywords, which are not further processed.  Such keywords may
  33   be used to represent dates, urls, etc.  Fields are optionally stored in the
  34   index, so that they may be returned with hits on the document.
  35   */
  36
  37 public final class Field extends AbstractField implements Fieldable, Serializable {
  38
  /** Specifies whether and how a field should be stored. */
  public enum Store {

    /**
     * Store the original field value in the index. This is useful for short texts
     * like a document's title which should be displayed with the results. The
     * value is stored in its original form, i.e. no analyzer is used before it is
     * stored.
     */
    YES(true),

    /** Do not store the field value in the index. */
    NO(false);

    /** Answer reported by {@link #isStored()} for this constant. */
    private final boolean stored;

    Store(boolean stored) {
      this.stored = stored;
    }

    /** Returns <code>true</code> for {@link #YES} and <code>false</code> for {@link #NO}. */
    public boolean isStored() {
      return stored;
    }
  }
  60
  /** Specifies whether and how a field should be indexed. */
  public enum Index {

    /**
     * Do not index the field value. This field can thus not be searched,
     * but one can still access its contents provided it is
     * {@link Field.Store stored}.
     */
    NO(false, false, true),

    /**
     * Index the tokens produced by running the field's
     * value through an Analyzer.  This is useful for
     * common text.
     */
    ANALYZED(true, true, false),

    /**
     * Index the field's value without using an Analyzer, so it can be searched.
     * As no analyzer is used the value will be stored as a single term. This is
     * useful for unique Ids like product numbers.
     */
    NOT_ANALYZED(true, false, false),

    /**
     * Expert: Index the field's value without an Analyzer,
     * and also disable the indexing of norms.  Note that you
     * can also separately enable/disable norms by calling
     * {@link Field#setOmitNorms}.  No norms means that
     * index-time field and document boosting and field
     * length normalization are disabled.  The benefit is
     * less memory usage as norms take up one byte of RAM
     * per indexed field for every document in the index,
     * during searching.  Note that once you index a given
     * field <i>with</i> norms enabled, disabling norms will
     * have no effect.  In other words, for this to have the
     * above described effect on a field, all instances of
     * that field must be indexed with NOT_ANALYZED_NO_NORMS
     * from the beginning.
     */
    NOT_ANALYZED_NO_NORMS(true, false, true),

    /**
     * Expert: Index the tokens produced by running the
     * field's value through an Analyzer, and also
     * separately disable the storing of norms.  See
     * {@link #NOT_ANALYZED_NO_NORMS} for what norms are
     * and why you may want to disable them.
     */
    ANALYZED_NO_NORMS(true, true, true);

    /** Answer reported by {@link #isIndexed()}. */
    private final boolean indexed;

    /** Answer reported by {@link #isAnalyzed()}. */
    private final boolean analyzed;

    /** Answer reported by {@link #omitNorms()}. */
    private final boolean normsOmitted;

    Index(boolean indexed, boolean analyzed, boolean normsOmitted) {
      this.indexed = indexed;
      this.analyzed = analyzed;
      this.normsOmitted = normsOmitted;
    }

    /**
     * Get the best representation of the index given the flags.
     * Equivalent to {@link #toIndex(boolean, boolean, boolean)} with
     * <code>omitNorms</code> set to <code>false</code>.
     */
    public static Index toIndex(boolean indexed, boolean analyzed) {
      return toIndex(indexed, analyzed, false);
    }

    /**
     * Expert: Get the best representation of the index given the flags.
     *
     * @param indexed when <code>false</code> the result is always {@link #NO},
     *  whatever the other two flags say
     * @param analyzed selects between the analyzed and not-analyzed constants
     * @param omitNorms selects between the regular and the <code>_NO_NORMS</code> constants
     */
    public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {
      // A field that is not indexed has nothing further to configure.
      if (!indexed) {
        return NO;
      }
      // Expert: norms omitted.
      if (omitNorms) {
        return analyzed ? ANALYZED_NO_NORMS : NOT_ANALYZED_NO_NORMS;
      }
      // Typical, non-expert case.
      return analyzed ? ANALYZED : NOT_ANALYZED;
    }

    /** Returns <code>true</code> for every constant except {@link #NO}. */
    public boolean isIndexed() {
      return indexed;
    }

    /** Returns <code>true</code> for {@link #ANALYZED} and {@link #ANALYZED_NO_NORMS}. */
    public boolean isAnalyzed() {
      return analyzed;
    }

    /** Returns <code>true</code> for {@link #NO} and the two <code>_NO_NORMS</code> constants. */
    public boolean omitNorms() {
      return normsOmitted;
    }
  }
 170
 171   /** Specifies whether and how a field should have term vectors. */
 172   public static enum TermVector {
 173
 174     /** Do not store term vectors.
 175      */
 176     NO {
 177       @Override
 178       public boolean isStored()      { return false; }
 179       @Override
 180       public boolean withPositions() { return false; }
 181       @Override
 182       public boolean withOffsets()   { return false; }
 183     },
 184
 185     /** Store the term vectors of each document. A term vector is a list
 186      * of the document's terms and their number of occurrences in that document. */
 187     YES {
 188       @Override
 189       public boolean isStored()      { return true;  }
 190       @Override
 191       public boolean withPositions() { return false; }
 192       @Override
 193       public boolean withOffsets()   { return false; }
 194     },
 195
 196     /**
 197      * Store the term vector + token position information
 198      *
 199      * @see #YES
 200      */
 201     WITH_POSITIONS {
 202       @Override
 203       public boolean isStored()      { return true;  }
 204       @Override
 205       public boolean withPositions() { return true;  }
 206       @Override
 207       public boolean withOffsets()   { return false; }
 208     },
 209
 210     /**
 211      * Store the term vector + Token offset information
 212      *
 213      * @see #YES
 214      */
 215     WITH_OFFSETS {
 216       @Override
 217       public boolean isStored()      { return true;  }
 218       @Override
 219       public boolean withPositions() { return false; }
 220       @Override
 221       public boolean withOffsets()   { return true;  }
 222     },
 223
 224     /**
 225      * Store the term vector + Token position and offset information
 226      *
 227      * @see #YES
 228      * @see #WITH_POSITIONS
 229      * @see #WITH_OFFSETS
 230      */
 231     WITH_POSITIONS_OFFSETS {
 232       @Override
 233       public boolean isStored()      { return true;  }
 234       @Override
 235       public boolean withPositions() { return true;  }
 236       @Override
 237       public boolean withOffsets()   { return true;  }
 238     };
 239
 240     /** Get the best representation of a TermVector given the flags. */
 241     public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {
 242
 243       // If it is not stored, nothing else matters.
 244       if (!stored) {
 245         return TermVector.NO;
 246       }
 247
 248       if (withOffsets) {
 249         if (withPositions) {
 250           return Field.TermVector.WITH_POSITIONS_OFFSETS;
 251         }
 252         return Field.TermVector.WITH_OFFSETS;
 253       }
 254
 255       if (withPositions) {
 256         return Field.TermVector.WITH_POSITIONS;
 257       }
 258       return Field.TermVector.YES;
 259     }
 260
 261     public abstract boolean isStored();
 262     public abstract boolean withPositions();
 263     public abstract boolean withOffsets();
 264   }
 265
 266
 267   /** The value of the field as a String, or null.  If null, the Reader value or
 268    * binary value is used.  Exactly one of stringValue(),
 269    * readerValue(), and getBinaryValue() must be set. */
 270   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
 271
 272   /** The value of the field as a Reader, or null.  If null, the String value or
 273    * binary value is used.  Exactly one of stringValue(),
 274    * readerValue(), and getBinaryValue() must be set. */
 275   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
 276
 277   /** The TokesStream for this field to be used when indexing, or null.  If null, the Reader value
 278    * or String value is analyzed to produce the indexed tokens. */
 279   public TokenStream tokenStreamValue()   { return tokenStream; }
 280
 281
 282   /** <p>Expert: change the value of this field.  This can
 283    *  be used during indexing to re-use a single Field
 284    *  instance to improve indexing speed by avoiding GC cost
 285    *  of new'ing and reclaiming Field instances.  Typically
 286    *  a single {@link Document} instance is re-used as
 287    *  well.  This helps most on small documents.</p>
 288    *
 289    *  <p>Each Field instance should only be used once
 290    *  within a single {@link Document} instance.  See <a
 291    *  href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
 292    *  for details.</p> */
 293   public void setValue(String value) {
 294     if (isBinary) {
 295       throw new IllegalArgumentException("cannot set a String value on a binary field");
 296     }
 297     fieldsData = value;
 298   }
 299
 300   /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
 301   public void setValue(Reader value) {
 302     if (isBinary) {
 303       throw new IllegalArgumentException("cannot set a Reader value on a binary field");
 304     }
 305     if (isStored) {
 306       throw new IllegalArgumentException("cannot set a Reader value on a stored field");
 307     }
 308     fieldsData = value;
 309   }
 310
 311   /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
 312   public void setValue(byte[] value) {
 313     if (!isBinary) {
 314       throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
 315     }
 316     fieldsData = value;
 317     binaryLength = value.length;
 318     binaryOffset = 0;
 319   }
 320
 321   /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
 322   public void setValue(byte[] value, int offset, int length) {
 323     if (!isBinary) {
 324       throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
 325     }
 326     fieldsData = value;
 327     binaryLength = length;
 328     binaryOffset = offset;
 329   }
 330
 331   /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
 332    *  May be combined with stored values from stringValue() or getBinaryValue() */
 333   public void setTokenStream(TokenStream tokenStream) {
 334     this.isIndexed = true;
 335     this.isTokenized = true;
 336     this.tokenStream = tokenStream;
 337   }
 338
 339   /**
 340    * Create a field by specifying its name, value and how it will
 341    * be saved in the index. Term vectors will not be stored in the index.
 342    *
 343    * @param name The name of the field
 344    * @param value The string to process
 345    * @param store Whether <code>value</code> should be stored in the index
 346    * @param index Whether the field should be indexed, and if so, if it should
 347    *  be tokenized before indexing
 348    * @throws NullPointerException if name or value is <code>null</code>
 349    * @throws IllegalArgumentException if the field is neither stored nor indexed
 350    */
 351   public Field(String name, String value, Store store, Index index) {
 352     this(name, value, store, index, TermVector.NO);
 353   }
 354
 355   /**
 356    * Create a field by specifying its name, value and how it will
 357    * be saved in the index.
 358    *
 359    * @param name The name of the field
 360    * @param value The string to process
 361    * @param store Whether <code>value</code> should be stored in the index
 362    * @param index Whether the field should be indexed, and if so, if it should
 363    *  be tokenized before indexing
 364    * @param termVector Whether term vector should be stored
 365    * @throws NullPointerException if name or value is <code>null</code>
 366    * @throws IllegalArgumentException in any of the following situations:
 367    * <ul>
 368    *  <li>the field is neither stored nor indexed</li>
 369    *  <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
 370    * </ul>
 371    */
 372   public Field(String name, String value, Store store, Index index, TermVector termVector) {
 373     this(name, true, value, store, index, termVector);
 374   }
 375
 376   /**
 377    * Create a field by specifying its name, value and how it will
 378    * be saved in the index.
 379    *
 380    * @param name The name of the field
 381    * @param internName Whether to .intern() name or not
 382    * @param value The string to process
 383    * @param store Whether <code>value</code> should be stored in the index
 384    * @param index Whether the field should be indexed, and if so, if it should
 385    *  be tokenized before indexing
 386    * @param termVector Whether term vector should be stored
 387    * @throws NullPointerException if name or value is <code>null</code>
 388    * @throws IllegalArgumentException in any of the following situations:
 389    * <ul>
 390    *  <li>the field is neither stored nor indexed</li>
 391    *  <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
 392    * </ul>
 393    */
 394   public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
 395     if (name == null)
 396       throw new NullPointerException("name cannot be null");
 397     if (value == null)
 398       throw new NullPointerException("value cannot be null");
 399     if (index == Index.NO && store == Store.NO)
 400       throw new IllegalArgumentException("it doesn't make sense to have a field that "
 401          + "is neither indexed nor stored");
 402     if (index == Index.NO && termVector != TermVector.NO)
 403       throw new IllegalArgumentException("cannot store term vector information "
 404          + "for a field that is not indexed");
 405
 406     if (internName) // field names are optionally interned
 407       name = StringHelper.intern(name);
 408
 409     this.name = name;
 410
 411     this.fieldsData = value;
 412
 413     this.isStored = store.isStored();
 414
 415     this.isIndexed = index.isIndexed();
 416     this.isTokenized = index.isAnalyzed();
 417     this.omitNorms = index.omitNorms();
 418     if (index == Index.NO) {
 419       // note: now this reads even wierder than before
 420       this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
 421     }
 422
 423     this.isBinary = false;
 424
 425     setStoreTermVector(termVector);
 426   }
 427
 428   /**
 429    * Create a tokenized and indexed field that is not stored. Term vectors will
 430    * not be stored.  The Reader is read only when the Document is added to the index,
 431    * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
 432    * has been called.
 433    *
 434    * @param name The name of the field
 435    * @param reader The reader with the content
 436    * @throws NullPointerException if name or reader is <code>null</code>
 437    */
 438   public Field(String name, Reader reader) {
 439     this(name, reader, TermVector.NO);
 440   }
 441
 442   /**
 443    * Create a tokenized and indexed field that is not stored, optionally with
 444    * storing term vectors.  The Reader is read only when the Document is added to the index,
 445    * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
 446    * has been called.
 447    *
 448    * @param name The name of the field
 449    * @param reader The reader with the content
 450    * @param termVector Whether term vector should be stored
 451    * @throws NullPointerException if name or reader is <code>null</code>
 452    */
 453   public Field(String name, Reader reader, TermVector termVector) {
 454     if (name == null)
 455       throw new NullPointerException("name cannot be null");
 456     if (reader == null)
 457       throw new NullPointerException("reader cannot be null");
 458
 459     this.name = StringHelper.intern(name);        // field names are interned
 460     this.fieldsData = reader;
 461
 462     this.isStored = false;
 463
 464     this.isIndexed = true;
 465     this.isTokenized = true;
 466
 467     this.isBinary = false;
 468
 469     setStoreTermVector(termVector);
 470   }
 471
 472   /**
 473    * Create a tokenized and indexed field that is not stored. Term vectors will
 474    * not be stored. This is useful for pre-analyzed fields.
 475    * The TokenStream is read only when the Document is added to the index,
 476    * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
 477    * has been called.
 478    *
 479    * @param name The name of the field
 480    * @param tokenStream The TokenStream with the content
 481    * @throws NullPointerException if name or tokenStream is <code>null</code>
 482    */
 483   public Field(String name, TokenStream tokenStream) {
 484     this(name, tokenStream, TermVector.NO);
 485   }
 486
 487   /**
 488    * Create a tokenized and indexed field that is not stored, optionally with
 489    * storing term vectors.  This is useful for pre-analyzed fields.
 490    * The TokenStream is read only when the Document is added to the index,
 491    * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
 492    * has been called.
 493    *
 494    * @param name The name of the field
 495    * @param tokenStream The TokenStream with the content
 496    * @param termVector Whether term vector should be stored
 497    * @throws NullPointerException if name or tokenStream is <code>null</code>
 498    */
 499   public Field(String name, TokenStream tokenStream, TermVector termVector) {
 500     if (name == null)
 501       throw new NullPointerException("name cannot be null");
 502     if (tokenStream == null)
 503       throw new NullPointerException("tokenStream cannot be null");
 504
 505     this.name = StringHelper.intern(name);        // field names are interned
 506     this.fieldsData = null;
 507     this.tokenStream = tokenStream;
 508
 509     this.isStored = false;
 510
 511     this.isIndexed = true;
 512     this.isTokenized = true;
 513
 514     this.isBinary = false;
 515
 516     setStoreTermVector(termVector);
 517   }
 518
 519
 520   /**
 521    * Create a stored field with binary value. Optionally the value may be compressed.
 522    *
 523    * @param name The name of the field
 524    * @param value The binary value
 525    * @param store Must be Store.YES
 526    * @throws IllegalArgumentException if store is <code>Store.NO</code>
 527    * @deprecated Use {@link #Field(String, byte[]) instead}
 528    */
 529   @Deprecated
 530   public Field(String name, byte[] value, Store store) {
 531     this(name, value, 0, value.length);
 532
 533     if (store == Store.NO) {
 534       throw new IllegalArgumentException("binary values can't be unstored");
 535     }
 536   }
 537
 538   /**
 539    * Create a stored field with binary value. Optionally the value may be compressed.
 540    *
 541    * @param name The name of the field
 542    * @param value The binary value
 543    */
 544   public Field(String name, byte[] value) {
 545     this(name, value, 0, value.length);
 546   }
 547
 548   /**
 549    * Create a stored field with binary value. Optionally the value may be compressed.
 550    *
 551    * @param name The name of the field
 552    * @param value The binary value
 553    * @param offset Starting offset in value where this Field's bytes are
 554    * @param length Number of bytes to use for this Field, starting at offset
 555    * @param store How <code>value</code> should be stored (compressed or not)
 556    * @throws IllegalArgumentException if store is <code>Store.NO</code>
 557    * @deprecated Use {@link #Field(String, byte[], int, int) instead}
 558    */
 559   @Deprecated
 560   public Field(String name, byte[] value, int offset, int length, Store store) {
 561     this(name, value, offset, length);
 562
 563     if (store == Store.NO) {
 564       throw new IllegalArgumentException("binary values can't be unstored");
 565     }
 566   }
 567
 568   /**
 569    * Create a stored field with binary value. Optionally the value may be compressed.
 570    *
 571    * @param name The name of the field
 572    * @param value The binary value
 573    * @param offset Starting offset in value where this Field's bytes are
 574    * @param length Number of bytes to use for this Field, starting at offset
 575    */
 576   public Field(String name, byte[] value, int offset, int length) {
 577
 578     if (name == null)
 579       throw new IllegalArgumentException("name cannot be null");
 580     if (value == null)
 581       throw new IllegalArgumentException("value cannot be null");
 582
 583     this.name = StringHelper.intern(name);        // field names are interned
 584     fieldsData = value;
 585
 586     isStored = true;
 587     isIndexed   = false;
 588     isTokenized = false;
 589     indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
 590     omitNorms = true;
 591
 592     isBinary    = true;
 593     binaryLength = length;
 594     binaryOffset = offset;
 595
 596     setStoreTermVector(TermVector.NO);
 597   }
 598 }