1 package org.apache.lucene.document;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.TokenStream;
21 import org.apache.lucene.index.IndexWriter; // for javadoc
22 import org.apache.lucene.util.StringHelper;
24 import java.io.Reader;
25 import java.io.Serializable;
27 import org.apache.lucene.index.FieldInfo.IndexOptions;
30 A field is a section of a Document. Each field has two parts, a name and a
31 value. Values may be free text, provided as a String or as a Reader, or they
32 may be atomic keywords, which are not further processed. Such keywords may
33 be used to represent dates, urls, etc. Fields are optionally stored in the
34 index, so that they may be returned with hits on the document.
37 public final class Field extends AbstractField implements Fieldable, Serializable {
39 /** Specifies whether and how a field should be stored. */
40 public static enum Store {
42 /** Store the original field value in the index. This is useful for short texts
43 * like a document's title which should be displayed with the results. The
44 * value is stored in its original form, i.e. no analyzer is used before it is
49 public boolean isStored() { return true; }
52 /** Do not store the field value in the index. */
55 public boolean isStored() { return false; }
58 public abstract boolean isStored();
61 /** Specifies whether and how a field should be indexed. */
62 public static enum Index {
64 /** Do not index the field value. This field can thus not be searched,
65 * but one can still access its contents provided it is
66 * {@link Field.Store stored}. */
69 public boolean isIndexed() { return false; }
71 public boolean isAnalyzed() { return false; }
73 public boolean omitNorms() { return true; }
76 /** Index the tokens produced by running the field's
77 * value through an Analyzer. This is useful for
81 public boolean isIndexed() { return true; }
83 public boolean isAnalyzed() { return true; }
85 public boolean omitNorms() { return false; }
88 /** Index the field's value without using an Analyzer, so it can be searched.
89 * As no analyzer is used the value will be stored as a single term. This is
90 * useful for unique Ids like product numbers.
94 public boolean isIndexed() { return true; }
96 public boolean isAnalyzed() { return false; }
98 public boolean omitNorms() { return false; }
101 /** Expert: Index the field's value without an Analyzer,
102 * and also disable the indexing of norms. Note that you
103 * can also separately enable/disable norms by calling
104 * {@link Field#setOmitNorms}. No norms means that
105 * index-time field and document boosting and field
106 * length normalization are disabled. The benefit is
107 * less memory usage as norms take up one byte of RAM
108 * per indexed field for every document in the index,
109 * during searching. Note that once you index a given
110 * field <i>with</i> norms enabled, disabling norms will
111 * have no effect. In other words, for this to have the
112 * above described effect on a field, all instances of
113 * that field must be indexed with NOT_ANALYZED_NO_NORMS
114 * from the beginning. */
115 NOT_ANALYZED_NO_NORMS {
117 public boolean isIndexed() { return true; }
119 public boolean isAnalyzed() { return false; }
121 public boolean omitNorms() { return true; }
124 /** Expert: Index the tokens produced by running the
125 * field's value through an Analyzer, and also
126 * separately disable the storing of norms. See
127 * {@link #NOT_ANALYZED_NO_NORMS} for what norms are
128 * and why you may want to disable them. */
131 public boolean isIndexed() { return true; }
133 public boolean isAnalyzed() { return true; }
135 public boolean omitNorms() { return true; }
138 /** Get the best representation of the index given the flags. */
139 public static Index toIndex(boolean indexed, boolean analyzed) {
140 return toIndex(indexed, analyzed, false);
143 /** Expert: Get the best representation of the index given the flags. */
144 public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {
146 // If it is not indexed nothing else matters
151 // typical, non-expert
154 return Index.ANALYZED;
156 return Index.NOT_ANALYZED;
159 // Expert: Norms omitted
161 return Index.ANALYZED_NO_NORMS;
163 return Index.NOT_ANALYZED_NO_NORMS;
166 public abstract boolean isIndexed();
167 public abstract boolean isAnalyzed();
168 public abstract boolean omitNorms();
171 /** Specifies whether and how a field should have term vectors. */
172 public static enum TermVector {
174 /** Do not store term vectors.
178 public boolean isStored() { return false; }
180 public boolean withPositions() { return false; }
182 public boolean withOffsets() { return false; }
185 /** Store the term vectors of each document. A term vector is a list
186 * of the document's terms and their number of occurrences in that document. */
189 public boolean isStored() { return true; }
191 public boolean withPositions() { return false; }
193 public boolean withOffsets() { return false; }
197 * Store the term vector + token position information
203 public boolean isStored() { return true; }
205 public boolean withPositions() { return true; }
207 public boolean withOffsets() { return false; }
211 * Store the term vector + Token offset information
217 public boolean isStored() { return true; }
219 public boolean withPositions() { return false; }
221 public boolean withOffsets() { return true; }
225 * Store the term vector + Token position and offset information
228 * @see #WITH_POSITIONS
231 WITH_POSITIONS_OFFSETS {
233 public boolean isStored() { return true; }
235 public boolean withPositions() { return true; }
237 public boolean withOffsets() { return true; }
240 /** Get the best representation of a TermVector given the flags. */
241 public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {
243 // If it is not stored, nothing else matters.
245 return TermVector.NO;
250 return Field.TermVector.WITH_POSITIONS_OFFSETS;
252 return Field.TermVector.WITH_OFFSETS;
256 return Field.TermVector.WITH_POSITIONS;
258 return Field.TermVector.YES;
261 public abstract boolean isStored();
262 public abstract boolean withPositions();
263 public abstract boolean withOffsets();
267 /** The value of the field as a String, or null. If null, the Reader value or
268 * binary value is used. Exactly one of stringValue(),
269 * readerValue(), and getBinaryValue() must be set. */
270 public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
272 /** The value of the field as a Reader, or null. If null, the String value or
273 * binary value is used. Exactly one of stringValue(),
274 * readerValue(), and getBinaryValue() must be set. */
275 public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
277 /** The TokesStream for this field to be used when indexing, or null. If null, the Reader value
278 * or String value is analyzed to produce the indexed tokens. */
279 public TokenStream tokenStreamValue() { return tokenStream; }
282 /** <p>Expert: change the value of this field. This can
283 * be used during indexing to re-use a single Field
284 * instance to improve indexing speed by avoiding GC cost
285 * of new'ing and reclaiming Field instances. Typically
286 * a single {@link Document} instance is re-used as
287 * well. This helps most on small documents.</p>
289 * <p>Each Field instance should only be used once
290 * within a single {@link Document} instance. See <a
291 * href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
292 * for details.</p> */
293 public void setValue(String value) {
295 throw new IllegalArgumentException("cannot set a String value on a binary field");
300 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
301 public void setValue(Reader value) {
303 throw new IllegalArgumentException("cannot set a Reader value on a binary field");
306 throw new IllegalArgumentException("cannot set a Reader value on a stored field");
311 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
312 public void setValue(byte[] value) {
314 throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
317 binaryLength = value.length;
321 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
322 public void setValue(byte[] value, int offset, int length) {
324 throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
327 binaryLength = length;
328 binaryOffset = offset;
331 /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
332 * May be combined with stored values from stringValue() or getBinaryValue() */
333 public void setTokenStream(TokenStream tokenStream) {
334 this.isIndexed = true;
335 this.isTokenized = true;
336 this.tokenStream = tokenStream;
340 * Create a field by specifying its name, value and how it will
341 * be saved in the index. Term vectors will not be stored in the index.
343 * @param name The name of the field
344 * @param value The string to process
345 * @param store Whether <code>value</code> should be stored in the index
346 * @param index Whether the field should be indexed, and if so, if it should
347 * be tokenized before indexing
348 * @throws NullPointerException if name or value is <code>null</code>
349 * @throws IllegalArgumentException if the field is neither stored nor indexed
351 public Field(String name, String value, Store store, Index index) {
352 this(name, value, store, index, TermVector.NO);
356 * Create a field by specifying its name, value and how it will
357 * be saved in the index.
359 * @param name The name of the field
360 * @param value The string to process
361 * @param store Whether <code>value</code> should be stored in the index
362 * @param index Whether the field should be indexed, and if so, if it should
363 * be tokenized before indexing
364 * @param termVector Whether term vector should be stored
365 * @throws NullPointerException if name or value is <code>null</code>
366 * @throws IllegalArgumentException in any of the following situations:
368 * <li>the field is neither stored nor indexed</li>
369 * <li>the field is not indexed but termVector is not <code>TermVector.NO</code></li>
372 public Field(String name, String value, Store store, Index index, TermVector termVector) {
373 this(name, true, value, store, index, termVector);
377 * Create a field by specifying its name, value and how it will
378 * be saved in the index.
380 * @param name The name of the field
381 * @param internName Whether to .intern() name or not
382 * @param value The string to process
383 * @param store Whether <code>value</code> should be stored in the index
384 * @param index Whether the field should be indexed, and if so, if it should
385 * be tokenized before indexing
386 * @param termVector Whether term vector should be stored
387 * @throws NullPointerException if name or value is <code>null</code>
388 * @throws IllegalArgumentException in any of the following situations:
390 * <li>the field is neither stored nor indexed</li>
391 * <li>the field is not indexed but termVector is not <code>TermVector.NO</code></li>
394 public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
396 throw new NullPointerException("name cannot be null");
398 throw new NullPointerException("value cannot be null");
399 if (index == Index.NO && store == Store.NO)
400 throw new IllegalArgumentException("it doesn't make sense to have a field that "
401 + "is neither indexed nor stored");
402 if (index == Index.NO && termVector != TermVector.NO)
403 throw new IllegalArgumentException("cannot store term vector information "
404 + "for a field that is not indexed");
406 if (internName) // field names are optionally interned
407 name = StringHelper.intern(name);
411 this.fieldsData = value;
413 this.isStored = store.isStored();
415 this.isIndexed = index.isIndexed();
416 this.isTokenized = index.isAnalyzed();
417 this.omitNorms = index.omitNorms();
418 if (index == Index.NO) {
419 // note: now this reads even weirder than before
420 this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
423 this.isBinary = false;
425 setStoreTermVector(termVector);
429 * Create a tokenized and indexed field that is not stored. Term vectors will
430 * not be stored. The Reader is read only when the Document is added to the index,
431 * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
434 * @param name The name of the field
435 * @param reader The reader with the content
436 * @throws NullPointerException if name or reader is <code>null</code>
438 public Field(String name, Reader reader) {
439 this(name, reader, TermVector.NO);
443 * Create a tokenized and indexed field that is not stored, optionally with
444 * storing term vectors. The Reader is read only when the Document is added to the index,
445 * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
448 * @param name The name of the field
449 * @param reader The reader with the content
450 * @param termVector Whether term vector should be stored
451 * @throws NullPointerException if name or reader is <code>null</code>
453 public Field(String name, Reader reader, TermVector termVector) {
455 throw new NullPointerException("name cannot be null");
457 throw new NullPointerException("reader cannot be null");
459 this.name = StringHelper.intern(name); // field names are interned
460 this.fieldsData = reader;
462 this.isStored = false;
464 this.isIndexed = true;
465 this.isTokenized = true;
467 this.isBinary = false;
469 setStoreTermVector(termVector);
473 * Create a tokenized and indexed field that is not stored. Term vectors will
474 * not be stored. This is useful for pre-analyzed fields.
475 * The TokenStream is read only when the Document is added to the index,
476 * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
479 * @param name The name of the field
480 * @param tokenStream The TokenStream with the content
481 * @throws NullPointerException if name or tokenStream is <code>null</code>
483 public Field(String name, TokenStream tokenStream) {
484 this(name, tokenStream, TermVector.NO);
488 * Create a tokenized and indexed field that is not stored, optionally with
489 * storing term vectors. This is useful for pre-analyzed fields.
490 * The TokenStream is read only when the Document is added to the index,
491 * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
494 * @param name The name of the field
495 * @param tokenStream The TokenStream with the content
496 * @param termVector Whether term vector should be stored
497 * @throws NullPointerException if name or tokenStream is <code>null</code>
499 public Field(String name, TokenStream tokenStream, TermVector termVector) {
501 throw new NullPointerException("name cannot be null");
502 if (tokenStream == null)
503 throw new NullPointerException("tokenStream cannot be null");
505 this.name = StringHelper.intern(name); // field names are interned
506 this.fieldsData = null;
507 this.tokenStream = tokenStream;
509 this.isStored = false;
511 this.isIndexed = true;
512 this.isTokenized = true;
514 this.isBinary = false;
516 setStoreTermVector(termVector);
521 * Create a stored field with binary value. Optionally the value may be compressed.
523 * @param name The name of the field
524 * @param value The binary value
525 * @param store Must be Store.YES
526 * @throws IllegalArgumentException if store is <code>Store.NO</code>
527 * @deprecated Use {@link #Field(String, byte[])} instead
530 public Field(String name, byte[] value, Store store) {
531 this(name, value, 0, value.length);
533 if (store == Store.NO) {
534 throw new IllegalArgumentException("binary values can't be unstored");
539 * Create a stored field with binary value. Optionally the value may be compressed.
541 * @param name The name of the field
542 * @param value The binary value
544 public Field(String name, byte[] value) {
545 this(name, value, 0, value.length);
549 * Create a stored field with binary value. Optionally the value may be compressed.
551 * @param name The name of the field
552 * @param value The binary value
553 * @param offset Starting offset in value where this Field's bytes are
554 * @param length Number of bytes to use for this Field, starting at offset
555 * @param store How <code>value</code> should be stored (compressed or not)
556 * @throws IllegalArgumentException if store is <code>Store.NO</code>
557 * @deprecated Use {@link #Field(String, byte[], int, int)} instead
560 public Field(String name, byte[] value, int offset, int length, Store store) {
561 this(name, value, offset, length);
563 if (store == Store.NO) {
564 throw new IllegalArgumentException("binary values can't be unstored");
569 * Create a stored field with binary value. Optionally the value may be compressed.
571 * @param name The name of the field
572 * @param value The binary value
573 * @param offset Starting offset in value where this Field's bytes are
574 * @param length Number of bytes to use for this Field, starting at offset
576 public Field(String name, byte[] value, int offset, int length) {
579 throw new IllegalArgumentException("name cannot be null");
581 throw new IllegalArgumentException("value cannot be null");
583 this.name = StringHelper.intern(name); // field names are interned
589 indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
593 binaryLength = length;
594 binaryOffset = offset;
596 setStoreTermVector(TermVector.NO);