lucene-java-3.4.0/lucene/src/java/org/apache/lucene/document/Field.java

   1 package org.apache.lucene.document;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.TokenStream;
  21 import org.apache.lucene.index.IndexWriter;   // for javadoc
  22 import org.apache.lucene.util.StringHelper;
  23
  24 import java.io.Reader;
  25 import java.io.Serializable;
  26
  27 import org.apache.lucene.index.FieldInfo.IndexOptions;
  28
  29 /**
  30   A field is a section of a Document.  Each field has two parts, a name and a
  31   value.  Values may be free text, provided as a String or as a Reader, or they
  32   may be atomic keywords, which are not further processed.  Such keywords may
  33   be used to represent dates, urls, etc.  Fields are optionally stored in the
  34   index, so that they may be returned with hits on the document.
  35   */
  36
  37 public final class Field extends AbstractField implements Fieldable, Serializable {
  38
  /** Specifies whether and how a field should be stored. */
  public enum Store {

    /**
     * Store the original field value in the index. This is useful for short
     * texts, such as a document's title, that should be displayed with the
     * results. The value is stored in its original form, i.e. no analyzer is
     * applied before it is stored.
     */
    YES(true),

    /** Do not store the field value in the index. */
    NO(false);

    /** Value reported by {@link #isStored()} for this constant. */
    private final boolean stored;

    Store(boolean stored) {
      this.stored = stored;
    }

    /** Returns {@code true} for {@link #YES} and {@code false} for {@link #NO}. */
    public boolean isStored() {
      return stored;
    }
  }
  60
  /** Specifies whether and how a field should be indexed. */
  public enum Index {

    /**
     * Do not index the field value. This field can thus not be searched,
     * but one can still access its contents provided it is
     * {@link Field.Store stored}.
     */
    NO(false, false, true),

    /**
     * Index the tokens produced by running the field's value through an
     * Analyzer. This is useful for common text.
     */
    ANALYZED(true, true, false),

    /**
     * Index the field's value without using an Analyzer, so it can be searched.
     * As no analyzer is used the value will be stored as a single term. This is
     * useful for unique Ids like product numbers.
     */
    NOT_ANALYZED(true, false, false),

    /**
     * Expert: Index the field's value without an Analyzer, and also disable
     * the indexing of norms. Note that you can also separately enable/disable
     * norms by calling {@link Field#setOmitNorms}. No norms means that
     * index-time field and document boosting and field length normalization
     * are disabled. The benefit is less memory usage, as norms take up one
     * byte of RAM per indexed field for every document in the index during
     * searching. Note that once you index a given field <i>with</i> norms
     * enabled, disabling norms will have no effect. In other words, for this
     * to have the above described effect on a field, all instances of that
     * field must be indexed with NOT_ANALYZED_NO_NORMS from the beginning.
     */
    NOT_ANALYZED_NO_NORMS(true, false, true),

    /**
     * Expert: Index the tokens produced by running the field's value through
     * an Analyzer, and also separately disable the storing of norms. See
     * {@link #NOT_ANALYZED_NO_NORMS} for what norms are and why you may want
     * to disable them.
     */
    ANALYZED_NO_NORMS(true, true, true);

    /** Value reported by {@link #isIndexed()}. */
    private final boolean indexed;

    /** Value reported by {@link #isAnalyzed()}. */
    private final boolean analyzed;

    /** Value reported by {@link #omitNorms()}. */
    private final boolean normsOmitted;

    Index(boolean indexed, boolean analyzed, boolean normsOmitted) {
      this.indexed = indexed;
      this.analyzed = analyzed;
      this.normsOmitted = normsOmitted;
    }

    /**
     * Get the best representation of the index given the flags. Equivalent to
     * {@link #toIndex(boolean, boolean, boolean)} with norms kept.
     */
    public static Index toIndex(boolean indexed, boolean analyzed) {
      return toIndex(indexed, analyzed, false);
    }

    /**
     * Expert: Get the best representation of the index given the flags.
     *
     * @param indexed whether the field is indexed; when {@code false} the
     *        other flags are ignored and {@link #NO} is returned
     * @param analyzed whether the value is run through an Analyzer
     * @param omitNorms whether norms are omitted
     * @return the constant matching the given flags
     */
    public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {
      // Nothing else matters for a field that is not indexed.
      if (!indexed) {
        return NO;
      }
      // Expert case: norms are omitted.
      if (omitNorms) {
        return analyzed ? ANALYZED_NO_NORMS : NOT_ANALYZED_NO_NORMS;
      }
      // Typical, non-expert case.
      return analyzed ? ANALYZED : NOT_ANALYZED;
    }

    /** Returns whether a field using this option is indexed. */
    public boolean isIndexed() {
      return indexed;
    }

    /** Returns whether a field using this option is run through an Analyzer. */
    public boolean isAnalyzed() {
      return analyzed;
    }

    /** Returns whether norms are omitted for a field using this option. */
    public boolean omitNorms() {
      return normsOmitted;
    }
  }
 170
 171   /** Specifies whether and how a field should have term vectors. */
 172   public static enum TermVector {
 173
 174     /** Do not store term vectors.
 175      */
 176     NO {
 177       @Override
 178       public boolean isStored()      { return false; }
 179       @Override
 180       public boolean withPositions() { return false; }
 181       @Override
 182       public boolean withOffsets()   { return false; }
 183     },
 184
 185     /** Store the term vectors of each document. A term vector is a list
 186      * of the document's terms and their number of occurrences in that document. */
 187     YES {
 188       @Override
 189       public boolean isStored()      { return true;  }
 190       @Override
 191       public boolean withPositions() { return false; }
 192       @Override
 193       public boolean withOffsets()   { return false; }
 194     },
 195
 196     /**
 197      * Store the term vector + token position information
 198      *
 199      * @see #YES
 200      */
 201     WITH_POSITIONS {
 202       @Override
 203       public boolean isStored()      { return true;  }
 204       @Override
 205       public boolean withPositions() { return true;  }
 206       @Override
 207       public boolean withOffsets()   { return false; }
 208     },
 209
 210     /**
 211      * Store the term vector + Token offset information
 212      *
 213      * @see #YES
 214      */
 215     WITH_OFFSETS {
 216       @Override
 217       public boolean isStored()      { return true;  }
 218       @Override
 219       public boolean withPositions() { return false; }
 220       @Override
 221       public boolean withOffsets()   { return true;  }
 222     },
 223
 224     /**
 225      * Store the term vector + Token position and offset information
 226      *
 227      * @see #YES
 228      * @see #WITH_POSITIONS
 229      * @see #WITH_OFFSETS
 230      */
 231     WITH_POSITIONS_OFFSETS {
 232       @Override
 233       public boolean isStored()      { return true;  }
 234       @Override
 235       public boolean withPositions() { return true;  }
 236       @Override
 237       public boolean withOffsets()   { return true;  }
 238     };
 239
 240     /** Get the best representation of a TermVector given the flags. */
 241     public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {
 242
 243       // If it is not stored, nothing else matters.
 244       if (!stored) {
 245         return TermVector.NO;
 246       }
 247
 248       if (withOffsets) {
 249         if (withPositions) {
 250           return Field.TermVector.WITH_POSITIONS_OFFSETS;
 251         }
 252         return Field.TermVector.WITH_OFFSETS;
 253       }
 254
 255       if (withPositions) {
 256         return Field.TermVector.WITH_POSITIONS;
 257       }
 258       return Field.TermVector.YES;
 259     }
 260
 261     public abstract boolean isStored();
 262     public abstract boolean withPositions();
 263     public abstract boolean withOffsets();
 264   }
 265
 266
 267   /** The value of the field as a String, or null.  If null, the Reader value or
 268    * binary value is used.  Exactly one of stringValue(),
 269    * readerValue(), and getBinaryValue() must be set. */
 270   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
 271
 272   /** The value of the field as a Reader, or null.  If null, the String value or
 273    * binary value is used.  Exactly one of stringValue(),
 274    * readerValue(), and getBinaryValue() must be set. */
 275   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
 276
 277   /** The TokesStream for this field to be used when indexing, or null.  If null, the Reader value
 278    * or String value is analyzed to produce the indexed tokens. */
 279   public TokenStream tokenStreamValue()   { return tokenStream; }
 280
 281
 282   /** <p>Expert: change the value of this field.  This can
 283    *  be used during indexing to re-use a single Field
 284    *  instance to improve indexing speed by avoiding GC cost
 285    *  of new'ing and reclaiming Field instances.  Typically
 286    *  a single {@link Document} instance is re-used as
 287    *  well.  This helps most on small documents.</p>
 288    *
 289    *  <p>Each Field instance should only be used once
 290    *  within a single {@link Document} instance.  See <a
 291    *  href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
 292    *  for details.</p> */
 293   public void setValue(String value) {
 294     if (isBinary) {
 295       throw new IllegalArgumentException("cannot set a String value on a binary field");
 296     }
 297     fieldsData = value;
 298   }
 299
 300   /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
 301   public void setValue(Reader value) {
 302     if (isBinary) {
 303       throw new IllegalArgumentException("cannot set a Reader value on a binary field");
 304     }
 305     if (isStored) {
 306       throw new IllegalArgumentException("cannot set a Reader value on a stored field");
 307     }
 308     fieldsData = value;
 309   }
 310
 311   /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
 312   public void setValue(byte[] value) {
 313     if (!isBinary) {
 314       throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
 315     }
 316     fieldsData = value;
 317     binaryLength = value.length;
 318     binaryOffset = 0;
 319   }
 320
 321   /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
 322   public void setValue(byte[] value, int offset, int length) {
 323     if (!isBinary) {
 324       throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
 325     }
 326     fieldsData = value;
 327     binaryLength = length;
 328     binaryOffset = offset;
 329   }
 330
 331   /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
 332    *  May be combined with stored values from stringValue() or getBinaryValue() */
 333   public void setTokenStream(TokenStream tokenStream) {
 334     this.isIndexed = true;
 335     this.isTokenized = true;
 336     this.tokenStream = tokenStream;
 337   }
 338
 339   /**
 340    * Create a field by specifying its name, value and how it will
 341    * be saved in the index. Term vectors will not be stored in the index.
 342    *
 343    * @param name The name of the field
 344    * @param value The string to process
 345    * @param store Whether <code>value</code> should be stored in the index
 346    * @param index Whether the field should be indexed, and if so, if it should
 347    *  be tokenized before indexing
 348    * @throws NullPointerException if name or value is <code>null</code>
 349    * @throws IllegalArgumentException if the field is neither stored nor indexed
 350    */
 351   public Field(String name, String value, Store store, Index index) {
 352     this(name, value, store, index, TermVector.NO);
 353   }
 354
 355   /**
 356    * Create a field by specifying its name, value and how it will
 357    * be saved in the index.
 358    *
 359    * @param name The name of the field
 360    * @param value The string to process
 361    * @param store Whether <code>value</code> should be stored in the index
 362    * @param index Whether the field should be indexed, and if so, if it should
 363    *  be tokenized before indexing
 364    * @param termVector Whether term vector should be stored
 365    * @throws NullPointerException if name or value is <code>null</code>
 366    * @throws IllegalArgumentException in any of the following situations:
 367    * <ul>
 368    *  <li>the field is neither stored nor indexed</li>
 369    *  <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
 370    * </ul>
 371    */
 372   public Field(String name, String value, Store store, Index index, TermVector termVector) {
 373     this(name, true, value, store, index, termVector);
 374   }
 375
 376   /**
 377    * Create a field by specifying its name, value and how it will
 378    * be saved in the index.
 379    *
 380    * @param name The name of the field
 381    * @param internName Whether to .intern() name or not
 382    * @param value The string to process
 383    * @param store Whether <code>value</code> should be stored in the index
 384    * @param index Whether the field should be indexed, and if so, if it should
 385    *  be tokenized before indexing
 386    * @param termVector Whether term vector should be stored
 387    * @throws NullPointerException if name or value is <code>null</code>
 388    * @throws IllegalArgumentException in any of the following situations:
 389    * <ul>
 390    *  <li>the field is neither stored nor indexed</li>
 391    *  <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
 392    * </ul>
 393    */
 394   public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
 395     if (name == null)
 396       throw new NullPointerException("name cannot be null");
 397     if (value == null)
 398       throw new NullPointerException("value cannot be null");
 399     if (name.length() == 0 && value.length() == 0)
 400       throw new IllegalArgumentException("name and value cannot both be empty");
 401     if (index == Index.NO && store == Store.NO)
 402       throw new IllegalArgumentException("it doesn't make sense to have a field that "
 403          + "is neither indexed nor stored");
 404     if (index == Index.NO && termVector != TermVector.NO)
 405       throw new IllegalArgumentException("cannot store term vector information "
 406          + "for a field that is not indexed");
 407
 408     if (internName) // field names are optionally interned
 409       name = StringHelper.intern(name);
 410
 411     this.name = name;
 412
 413     this.fieldsData = value;
 414
 415     this.isStored = store.isStored();
 416
 417     this.isIndexed = index.isIndexed();
 418     this.isTokenized = index.isAnalyzed();
 419     this.omitNorms = index.omitNorms();
 420     if (index == Index.NO) {
 421       // note: now this reads even wierder than before
 422       this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
 423     }
 424
 425     this.isBinary = false;
 426
 427     setStoreTermVector(termVector);
 428   }
 429
 430   /**
 431    * Create a tokenized and indexed field that is not stored. Term vectors will
 432    * not be stored.  The Reader is read only when the Document is added to the index,
 433    * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
 434    * has been called.
 435    *
 436    * @param name The name of the field
 437    * @param reader The reader with the content
 438    * @throws NullPointerException if name or reader is <code>null</code>
 439    */
 440   public Field(String name, Reader reader) {
 441     this(name, reader, TermVector.NO);
 442   }
 443
 444   /**
 445    * Create a tokenized and indexed field that is not stored, optionally with
 446    * storing term vectors.  The Reader is read only when the Document is added to the index,
 447    * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
 448    * has been called.
 449    *
 450    * @param name The name of the field
 451    * @param reader The reader with the content
 452    * @param termVector Whether term vector should be stored
 453    * @throws NullPointerException if name or reader is <code>null</code>
 454    */
 455   public Field(String name, Reader reader, TermVector termVector) {
 456     if (name == null)
 457       throw new NullPointerException("name cannot be null");
 458     if (reader == null)
 459       throw new NullPointerException("reader cannot be null");
 460
 461     this.name = StringHelper.intern(name);        // field names are interned
 462     this.fieldsData = reader;
 463
 464     this.isStored = false;
 465
 466     this.isIndexed = true;
 467     this.isTokenized = true;
 468
 469     this.isBinary = false;
 470
 471     setStoreTermVector(termVector);
 472   }
 473
 474   /**
 475    * Create a tokenized and indexed field that is not stored. Term vectors will
 476    * not be stored. This is useful for pre-analyzed fields.
 477    * The TokenStream is read only when the Document is added to the index,
 478    * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
 479    * has been called.
 480    *
 481    * @param name The name of the field
 482    * @param tokenStream The TokenStream with the content
 483    * @throws NullPointerException if name or tokenStream is <code>null</code>
 484    */
 485   public Field(String name, TokenStream tokenStream) {
 486     this(name, tokenStream, TermVector.NO);
 487   }
 488
 489   /**
 490    * Create a tokenized and indexed field that is not stored, optionally with
 491    * storing term vectors.  This is useful for pre-analyzed fields.
 492    * The TokenStream is read only when the Document is added to the index,
 493    * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
 494    * has been called.
 495    *
 496    * @param name The name of the field
 497    * @param tokenStream The TokenStream with the content
 498    * @param termVector Whether term vector should be stored
 499    * @throws NullPointerException if name or tokenStream is <code>null</code>
 500    */
 501   public Field(String name, TokenStream tokenStream, TermVector termVector) {
 502     if (name == null)
 503       throw new NullPointerException("name cannot be null");
 504     if (tokenStream == null)
 505       throw new NullPointerException("tokenStream cannot be null");
 506
 507     this.name = StringHelper.intern(name);        // field names are interned
 508     this.fieldsData = null;
 509     this.tokenStream = tokenStream;
 510
 511     this.isStored = false;
 512
 513     this.isIndexed = true;
 514     this.isTokenized = true;
 515
 516     this.isBinary = false;
 517
 518     setStoreTermVector(termVector);
 519   }
 520
 521
 522   /**
 523    * Create a stored field with binary value. Optionally the value may be compressed.
 524    *
 525    * @param name The name of the field
 526    * @param value The binary value
 527    * @param store Must be Store.YES
 528    * @throws IllegalArgumentException if store is <code>Store.NO</code>
 529    * @deprecated Use {@link #Field(String, byte[]) instead}
 530    */
 531   @Deprecated
 532   public Field(String name, byte[] value, Store store) {
 533     this(name, value, 0, value.length);
 534
 535     if (store == Store.NO) {
 536       throw new IllegalArgumentException("binary values can't be unstored");
 537     }
 538   }
 539
 540   /**
 541    * Create a stored field with binary value. Optionally the value may be compressed.
 542    *
 543    * @param name The name of the field
 544    * @param value The binary value
 545    */
 546   public Field(String name, byte[] value) {
 547     this(name, value, 0, value.length);
 548   }
 549
 550   /**
 551    * Create a stored field with binary value. Optionally the value may be compressed.
 552    *
 553    * @param name The name of the field
 554    * @param value The binary value
 555    * @param offset Starting offset in value where this Field's bytes are
 556    * @param length Number of bytes to use for this Field, starting at offset
 557    * @param store How <code>value</code> should be stored (compressed or not)
 558    * @throws IllegalArgumentException if store is <code>Store.NO</code>
 559    * @deprecated Use {@link #Field(String, byte[], int, int) instead}
 560    */
 561   @Deprecated
 562   public Field(String name, byte[] value, int offset, int length, Store store) {
 563     this(name, value, offset, length);
 564
 565     if (store == Store.NO) {
 566       throw new IllegalArgumentException("binary values can't be unstored");
 567     }
 568   }
 569
 570   /**
 571    * Create a stored field with binary value. Optionally the value may be compressed.
 572    *
 573    * @param name The name of the field
 574    * @param value The binary value
 575    * @param offset Starting offset in value where this Field's bytes are
 576    * @param length Number of bytes to use for this Field, starting at offset
 577    */
 578   public Field(String name, byte[] value, int offset, int length) {
 579
 580     if (name == null)
 581       throw new IllegalArgumentException("name cannot be null");
 582     if (value == null)
 583       throw new IllegalArgumentException("value cannot be null");
 584
 585     this.name = StringHelper.intern(name);        // field names are interned
 586     fieldsData = value;
 587
 588     isStored = true;
 589     isIndexed   = false;
 590     isTokenized = false;
 591     indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
 592     omitNorms = true;
 593
 594     isBinary    = true;
 595     binaryLength = length;
 596     binaryOffset = offset;
 597
 598     setStoreTermVector(TermVector.NO);
 599   }
 600 }