1 package org.apache.lucene.document;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.TokenStream;
21 import org.apache.lucene.index.IndexWriter; // for javadoc
22 import org.apache.lucene.util.StringHelper;
24 import java.io.Reader;
25 import java.io.Serializable;
27 import org.apache.lucene.index.FieldInfo.IndexOptions;
30 A field is a section of a Document. Each field has two parts, a name and a
31 value. Values may be free text, provided as a String or as a Reader, or they
32 may be atomic keywords, which are not further processed. Such keywords may
33 be used to represent dates, urls, etc. Fields are optionally stored in the
34 index, so that they may be returned with hits on the document.
37 public final class Field extends AbstractField implements Fieldable, Serializable {
39 /** Specifies whether and how a field should be stored. */
40 public static enum Store {
42 /** Store the original field value in the index. This is useful for short texts
43 * like a document's title which should be displayed with the results. The
44 * value is stored in its original form, i.e. no analyzer is used before it is
49 public boolean isStored() { return true; }
52 /** Do not store the field value in the index. */
55 public boolean isStored() { return false; }
58 public abstract boolean isStored();
61 /** Specifies whether and how a field should be indexed. */
62 public static enum Index {
64 /** Do not index the field value. This field can thus not be searched,
65 * but one can still access its contents provided it is
66 * {@link Field.Store stored}. */
69 public boolean isIndexed() { return false; }
71 public boolean isAnalyzed() { return false; }
73 public boolean omitNorms() { return true; }
76 /** Index the tokens produced by running the field's
77 * value through an Analyzer. This is useful for
81 public boolean isIndexed() { return true; }
83 public boolean isAnalyzed() { return true; }
85 public boolean omitNorms() { return false; }
88 /** Index the field's value without using an Analyzer, so it can be searched.
89 * As no analyzer is used the value will be stored as a single term. This is
90 * useful for unique Ids like product numbers.
94 public boolean isIndexed() { return true; }
96 public boolean isAnalyzed() { return false; }
98 public boolean omitNorms() { return false; }
101 /** Expert: Index the field's value without an Analyzer,
102 * and also disable the indexing of norms. Note that you
103 * can also separately enable/disable norms by calling
104 * {@link Field#setOmitNorms}. No norms means that
105 * index-time field and document boosting and field
106 * length normalization are disabled. The benefit is
107 * less memory usage as norms take up one byte of RAM
108 * per indexed field for every document in the index,
109 * during searching. Note that once you index a given
110 * field <i>with</i> norms enabled, disabling norms will
111 * have no effect. In other words, for this to have the
112 * above described effect on a field, all instances of
113 * that field must be indexed with NOT_ANALYZED_NO_NORMS
114 * from the beginning. */
115 NOT_ANALYZED_NO_NORMS {
117 public boolean isIndexed() { return true; }
119 public boolean isAnalyzed() { return false; }
121 public boolean omitNorms() { return true; }
124 /** Expert: Index the tokens produced by running the
125 * field's value through an Analyzer, and also
126 * separately disable the storing of norms. See
127 * {@link #NOT_ANALYZED_NO_NORMS} for what norms are
128 * and why you may want to disable them. */
131 public boolean isIndexed() { return true; }
133 public boolean isAnalyzed() { return true; }
135 public boolean omitNorms() { return true; }
138 /** Get the best representation of the index given the flags. */
139 public static Index toIndex(boolean indexed, boolean analyzed) {
140 return toIndex(indexed, analyzed, false);
143 /** Expert: Get the best representation of the index given the flags. */
144 public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {
146 // If it is not indexed nothing else matters
151 // typical, non-expert
154 return Index.ANALYZED;
156 return Index.NOT_ANALYZED;
159 // Expert: Norms omitted
161 return Index.ANALYZED_NO_NORMS;
163 return Index.NOT_ANALYZED_NO_NORMS;
166 public abstract boolean isIndexed();
167 public abstract boolean isAnalyzed();
168 public abstract boolean omitNorms();
171 /** Specifies whether and how a field should have term vectors. */
172 public static enum TermVector {
174 /** Do not store term vectors.
178 public boolean isStored() { return false; }
180 public boolean withPositions() { return false; }
182 public boolean withOffsets() { return false; }
185 /** Store the term vectors of each document. A term vector is a list
186 * of the document's terms and their number of occurrences in that document. */
189 public boolean isStored() { return true; }
191 public boolean withPositions() { return false; }
193 public boolean withOffsets() { return false; }
197 * Store the term vector + token position information
203 public boolean isStored() { return true; }
205 public boolean withPositions() { return true; }
207 public boolean withOffsets() { return false; }
211 * Store the term vector + Token offset information
217 public boolean isStored() { return true; }
219 public boolean withPositions() { return false; }
221 public boolean withOffsets() { return true; }
225 * Store the term vector + Token position and offset information
228 * @see #WITH_POSITIONS
231 WITH_POSITIONS_OFFSETS {
233 public boolean isStored() { return true; }
235 public boolean withPositions() { return true; }
237 public boolean withOffsets() { return true; }
240 /** Get the best representation of a TermVector given the flags. */
241 public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {
243 // If it is not stored, nothing else matters.
245 return TermVector.NO;
250 return Field.TermVector.WITH_POSITIONS_OFFSETS;
252 return Field.TermVector.WITH_OFFSETS;
256 return Field.TermVector.WITH_POSITIONS;
258 return Field.TermVector.YES;
261 public abstract boolean isStored();
262 public abstract boolean withPositions();
263 public abstract boolean withOffsets();
267 /** The value of the field as a String, or null. If null, the Reader value or
268 * binary value is used. Exactly one of stringValue(),
269 * readerValue(), and getBinaryValue() must be set. */
270 public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
272 /** The value of the field as a Reader, or null. If null, the String value or
273 * binary value is used. Exactly one of stringValue(),
274 * readerValue(), and getBinaryValue() must be set. */
275 public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
277 /** The TokesStream for this field to be used when indexing, or null. If null, the Reader value
278 * or String value is analyzed to produce the indexed tokens. */
279 public TokenStream tokenStreamValue() { return tokenStream; }
282 /** <p>Expert: change the value of this field. This can
283 * be used during indexing to re-use a single Field
284 * instance to improve indexing speed by avoiding GC cost
285 * of new'ing and reclaiming Field instances. Typically
286 * a single {@link Document} instance is re-used as
287 * well. This helps most on small documents.</p>
289 * <p>Each Field instance should only be used once
290 * within a single {@link Document} instance. See <a
291 * href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
292 * for details.</p> */
293 public void setValue(String value) {
295 throw new IllegalArgumentException("cannot set a String value on a binary field");
300 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
301 public void setValue(Reader value) {
303 throw new IllegalArgumentException("cannot set a Reader value on a binary field");
306 throw new IllegalArgumentException("cannot set a Reader value on a stored field");
311 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
312 public void setValue(byte[] value) {
314 throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
317 binaryLength = value.length;
321 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
322 public void setValue(byte[] value, int offset, int length) {
324 throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
327 binaryLength = length;
328 binaryOffset = offset;
331 /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
332 * May be combined with stored values from stringValue() or getBinaryValue() */
333 public void setTokenStream(TokenStream tokenStream) {
334 this.isIndexed = true;
335 this.isTokenized = true;
336 this.tokenStream = tokenStream;
340 * Create a field by specifying its name, value and how it will
341 * be saved in the index. Term vectors will not be stored in the index.
343 * @param name The name of the field
344 * @param value The string to process
345 * @param store Whether <code>value</code> should be stored in the index
346 * @param index Whether the field should be indexed, and if so, if it should
347 * be tokenized before indexing
348 * @throws NullPointerException if name or value is <code>null</code>
349 * @throws IllegalArgumentException if the field is neither stored nor indexed
351 public Field(String name, String value, Store store, Index index) {
352 this(name, value, store, index, TermVector.NO);
356 * Create a field by specifying its name, value and how it will
357 * be saved in the index.
359 * @param name The name of the field
360 * @param value The string to process
361 * @param store Whether <code>value</code> should be stored in the index
362 * @param index Whether the field should be indexed, and if so, if it should
363 * be tokenized before indexing
364 * @param termVector Whether term vector should be stored
365 * @throws NullPointerException if name or value is <code>null</code>
366 * @throws IllegalArgumentException in any of the following situations:
368 * <li>the field is neither stored nor indexed</li>
369 * <li>the field is not indexed but termVector is not <code>TermVector.NO</code></li>
372 public Field(String name, String value, Store store, Index index, TermVector termVector) {
373 this(name, true, value, store, index, termVector);
377 * Create a field by specifying its name, value and how it will
378 * be saved in the index.
380 * @param name The name of the field
381 * @param internName Whether to .intern() name or not
382 * @param value The string to process
383 * @param store Whether <code>value</code> should be stored in the index
384 * @param index Whether the field should be indexed, and if so, if it should
385 * be tokenized before indexing
386 * @param termVector Whether term vector should be stored
387 * @throws NullPointerException if name or value is <code>null</code>
388 * @throws IllegalArgumentException in any of the following situations:
390 * <li>the field is neither stored nor indexed</li>
391 * <li>the field is not indexed but termVector is not <code>TermVector.NO</code></li>
394 public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
396 throw new NullPointerException("name cannot be null");
398 throw new NullPointerException("value cannot be null");
399 if (name.length() == 0 && value.length() == 0)
400 throw new IllegalArgumentException("name and value cannot both be empty");
401 if (index == Index.NO && store == Store.NO)
402 throw new IllegalArgumentException("it doesn't make sense to have a field that "
403 + "is neither indexed nor stored");
404 if (index == Index.NO && termVector != TermVector.NO)
405 throw new IllegalArgumentException("cannot store term vector information "
406 + "for a field that is not indexed");
408 if (internName) // field names are optionally interned
409 name = StringHelper.intern(name);
413 this.fieldsData = value;
415 this.isStored = store.isStored();
417 this.isIndexed = index.isIndexed();
418 this.isTokenized = index.isAnalyzed();
419 this.omitNorms = index.omitNorms();
420 if (index == Index.NO) {
421 // note: now this reads even weirder than before
422 this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
425 this.isBinary = false;
427 setStoreTermVector(termVector);
431 * Create a tokenized and indexed field that is not stored. Term vectors will
432 * not be stored. The Reader is read only when the Document is added to the index,
433 * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
436 * @param name The name of the field
437 * @param reader The reader with the content
438 * @throws NullPointerException if name or reader is <code>null</code>
440 public Field(String name, Reader reader) {
441 this(name, reader, TermVector.NO);
445 * Create a tokenized and indexed field that is not stored, optionally with
446 * storing term vectors. The Reader is read only when the Document is added to the index,
447 * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
450 * @param name The name of the field
451 * @param reader The reader with the content
452 * @param termVector Whether term vector should be stored
453 * @throws NullPointerException if name or reader is <code>null</code>
455 public Field(String name, Reader reader, TermVector termVector) {
457 throw new NullPointerException("name cannot be null");
459 throw new NullPointerException("reader cannot be null");
461 this.name = StringHelper.intern(name); // field names are interned
462 this.fieldsData = reader;
464 this.isStored = false;
466 this.isIndexed = true;
467 this.isTokenized = true;
469 this.isBinary = false;
471 setStoreTermVector(termVector);
475 * Create a tokenized and indexed field that is not stored. Term vectors will
476 * not be stored. This is useful for pre-analyzed fields.
477 * The TokenStream is read only when the Document is added to the index,
478 * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
481 * @param name The name of the field
482 * @param tokenStream The TokenStream with the content
483 * @throws NullPointerException if name or tokenStream is <code>null</code>
485 public Field(String name, TokenStream tokenStream) {
486 this(name, tokenStream, TermVector.NO);
490 * Create a tokenized and indexed field that is not stored, optionally with
491 * storing term vectors. This is useful for pre-analyzed fields.
492 * The TokenStream is read only when the Document is added to the index,
493 * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
496 * @param name The name of the field
497 * @param tokenStream The TokenStream with the content
498 * @param termVector Whether term vector should be stored
499 * @throws NullPointerException if name or tokenStream is <code>null</code>
501 public Field(String name, TokenStream tokenStream, TermVector termVector) {
503 throw new NullPointerException("name cannot be null");
504 if (tokenStream == null)
505 throw new NullPointerException("tokenStream cannot be null");
507 this.name = StringHelper.intern(name); // field names are interned
508 this.fieldsData = null;
509 this.tokenStream = tokenStream;
511 this.isStored = false;
513 this.isIndexed = true;
514 this.isTokenized = true;
516 this.isBinary = false;
518 setStoreTermVector(termVector);
523 * Create a stored field with binary value. Optionally the value may be compressed.
525 * @param name The name of the field
526 * @param value The binary value
527 * @param store Must be Store.YES
528 * @throws IllegalArgumentException if store is <code>Store.NO</code>
529 * @deprecated Use {@link #Field(String, byte[])} instead
532 public Field(String name, byte[] value, Store store) {
533 this(name, value, 0, value.length);
535 if (store == Store.NO) {
536 throw new IllegalArgumentException("binary values can't be unstored");
541 * Create a stored field with binary value. Optionally the value may be compressed.
543 * @param name The name of the field
544 * @param value The binary value
546 public Field(String name, byte[] value) {
547 this(name, value, 0, value.length);
551 * Create a stored field with binary value. Optionally the value may be compressed.
553 * @param name The name of the field
554 * @param value The binary value
555 * @param offset Starting offset in value where this Field's bytes are
556 * @param length Number of bytes to use for this Field, starting at offset
557 * @param store How <code>value</code> should be stored (compressed or not)
558 * @throws IllegalArgumentException if store is <code>Store.NO</code>
559 * @deprecated Use {@link #Field(String, byte[], int, int)} instead
562 public Field(String name, byte[] value, int offset, int length, Store store) {
563 this(name, value, offset, length);
565 if (store == Store.NO) {
566 throw new IllegalArgumentException("binary values can't be unstored");
571 * Create a stored field with binary value. Optionally the value may be compressed.
573 * @param name The name of the field
574 * @param value The binary value
575 * @param offset Starting offset in value where this Field's bytes are
576 * @param length Number of bytes to use for this Field, starting at offset
578 public Field(String name, byte[] value, int offset, int length) {
581 throw new IllegalArgumentException("name cannot be null");
583 throw new IllegalArgumentException("value cannot be null");
585 this.name = StringHelper.intern(name); // field names are interned
591 indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
595 binaryLength = length;
596 binaryOffset = offset;
598 setStoreTermVector(TermVector.NO);