package org.apache.lucene.document;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter;   // for javadoc
import org.apache.lucene.util.StringHelper;

import java.io.Reader;
import java.io.Serializable;

import org.apache.lucene.index.FieldInfo.IndexOptions;

/**
 * A field is a section of a Document.  Each field has two parts, a name and a
 * value.  Values may be free text, provided as a String or as a Reader, or they
 * may be atomic keywords, which are not further processed.  Such keywords may
 * be used to represent dates, urls, etc.  Fields are optionally stored in the
 * index, so that they may be returned with hits on the document.
 */
public final class Field extends AbstractField implements Fieldable, Serializable {

  /** Specifies whether and how a field should be stored. */
  public enum Store {

    /** Store the original field value in the index. This is useful for short texts
     * like a document's title which should be displayed with the results. The
     * value is stored in its original form, i.e. no analyzer is used before it is
     * stored.
     */
    YES(true),

    /** Do not store the field value in the index. */
    NO(false);

    private final boolean stored;

    Store(boolean stored) {
      this.stored = stored;
    }

    /** Returns true if this option stores the field value in the index. */
    public boolean isStored() { return stored; }
  }

  /** Specifies whether and how a field should be indexed. */
  public enum Index {

    /** Do not index the field value. This field can thus not be searched,
     * but one can still access its contents provided it is
     * {@link Field.Store stored}. */
    NO(false, false, true),

    /** Index the tokens produced by running the field's
     * value through an Analyzer.  This is useful for
     * common text. */
    ANALYZED(true, true, false),

    /** Index the field's value without using an Analyzer, so it can be searched.
     * As no analyzer is used the value will be stored as a single term. This is
     * useful for unique Ids like product numbers.
     */
    NOT_ANALYZED(true, false, false),

    /** Expert: Index the field's value without an Analyzer,
     * and also disable the indexing of norms.  Note that you
     * can also separately enable/disable norms by calling
     * {@link Field#setOmitNorms}.  No norms means that
     * index-time field and document boosting and field
     * length normalization are disabled.  The benefit is
     * less memory usage as norms take up one byte of RAM
     * per indexed field for every document in the index,
     * during searching.  Note that once you index a given
     * field <i>with</i> norms enabled, disabling norms will
     * have no effect.  In other words, for this to have the
     * above described effect on a field, all instances of
     * that field must be indexed with NOT_ANALYZED_NO_NORMS
     * from the beginning. */
    NOT_ANALYZED_NO_NORMS(true, false, true),

    /** Expert: Index the tokens produced by running the
     *  field's value through an Analyzer, and also
     *  separately disable the storing of norms.  See
     *  {@link #NOT_ANALYZED_NO_NORMS} for what norms are
     *  and why you may want to disable them. */
    ANALYZED_NO_NORMS(true, true, true);

    private final boolean indexed;
    private final boolean analyzed;
    private final boolean omitNorms;

    Index(boolean indexed, boolean analyzed, boolean omitNorms) {
      this.indexed = indexed;
      this.analyzed = analyzed;
      this.omitNorms = omitNorms;
    }

    /** Get the best representation of the index given the flags. */
    public static Index toIndex(boolean indexed, boolean analyzed) {
      return toIndex(indexed, analyzed, false);
    }

    /** Expert: Get the best representation of the index given the flags. */
    public static Index toIndex(boolean indexed, boolean analyzed, boolean omitNorms) {

      // If it is not indexed nothing else matters
      if (!indexed) {
        return Index.NO;
      }

      // typical, non-expert
      if (!omitNorms) {
        return analyzed ? Index.ANALYZED : Index.NOT_ANALYZED;
      }

      // Expert: Norms omitted
      return analyzed ? Index.ANALYZED_NO_NORMS : Index.NOT_ANALYZED_NO_NORMS;
    }

    /** Returns true if this option indexes the field value. */
    public boolean isIndexed() { return indexed; }

    /** Returns true if this option runs the field value through an Analyzer. */
    public boolean isAnalyzed() { return analyzed; }

    /** Returns true if this option omits norms. */
    public boolean omitNorms() { return omitNorms; }
  }

  /** Specifies whether and how a field should have term vectors. */
  public enum TermVector {

    /** Do not store term vectors.
     */
    NO(false, false, false),

    /** Store the term vectors of each document. A term vector is a list
     * of the document's terms and their number of occurrences in that document. */
    YES(true, false, false),

    /**
     * Store the term vector + token position information
     *
     * @see #YES
     */
    WITH_POSITIONS(true, true, false),

    /**
     * Store the term vector + Token offset information
     *
     * @see #YES
     */
    WITH_OFFSETS(true, false, true),

    /**
     * Store the term vector + Token position and offset information
     *
     * @see #YES
     * @see #WITH_POSITIONS
     * @see #WITH_OFFSETS
     */
    WITH_POSITIONS_OFFSETS(true, true, true);

    private final boolean stored;
    private final boolean positions;
    private final boolean offsets;

    TermVector(boolean stored, boolean positions, boolean offsets) {
      this.stored = stored;
      this.positions = positions;
      this.offsets = offsets;
    }

    /** Get the best representation of a TermVector given the flags. */
    public static TermVector toTermVector(boolean stored, boolean withOffsets, boolean withPositions) {

      // If it is not stored, nothing else matters.
      if (!stored) {
        return TermVector.NO;
      }

      if (withOffsets) {
        return withPositions ? TermVector.WITH_POSITIONS_OFFSETS : TermVector.WITH_OFFSETS;
      }

      return withPositions ? TermVector.WITH_POSITIONS : TermVector.YES;
    }

    /** Returns true if this option stores term vectors. */
    public boolean isStored() { return stored; }

    /** Returns true if this option stores token positions with the term vector. */
    public boolean withPositions() { return positions; }

    /** Returns true if this option stores token offsets with the term vector. */
    public boolean withOffsets() { return offsets; }
  }


  /** The value of the field as a String, or null.  If null, the Reader value or
   * binary value is used.  Exactly one of stringValue(),
   * readerValue(), and getBinaryValue() must be set. */
  public String stringValue() { return fieldsData instanceof String ? (String) fieldsData : null; }

  /** The value of the field as a Reader, or null.  If null, the String value or
   * binary value is used.  Exactly one of stringValue(),
   * readerValue(), and getBinaryValue() must be set. */
  public Reader readerValue() { return fieldsData instanceof Reader ? (Reader) fieldsData : null; }

  /** The TokenStream for this field to be used when indexing, or null.  If null, the Reader value
   * or String value is analyzed to produce the indexed tokens. */
  public TokenStream tokenStreamValue() { return tokenStream; }


  /** <p>Expert: change the value of this field.  This can
   *  be used during indexing to re-use a single Field
   *  instance to improve indexing speed by avoiding GC cost
   *  of new'ing and reclaiming Field instances.  Typically
   *  a single {@link Document} instance is re-used as
   *  well.  This helps most on small documents.</p>
   *
   *  <p>Each Field instance should only be used once
   *  within a single {@link Document} instance.  See ImproveIndexingSpeed
   *  for details.</p>
   *
   *  @throws IllegalArgumentException if this is a binary field
   */
  public void setValue(String value) {
    if (isBinary) {
      throw new IllegalArgumentException("cannot set a String value on a binary field");
    }
    fieldsData = value;
  }

  /** Expert: change the value of this field.  See {@link #setValue(String)}.
   *
   *  @throws IllegalArgumentException if this is a binary field or a stored field
   */
  public void setValue(Reader value) {
    if (isBinary) {
      throw new IllegalArgumentException("cannot set a Reader value on a binary field");
    }
    if (isStored) {
      throw new IllegalArgumentException("cannot set a Reader value on a stored field");
    }
    fieldsData = value;
  }

  /** Expert: change the value of this field.  See {@link #setValue(String)}.
   *
   *  @throws IllegalArgumentException if this is not a binary field
   *  @throws NullPointerException if value is null
   */
  public void setValue(byte[] value) {
    if (!isBinary) {
      throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
    }
    // Reject null before touching any state, so a failed call cannot leave
    // fieldsData replaced while binaryLength/binaryOffset still describe the old value.
    if (value == null) {
      throw new NullPointerException("value cannot be null");
    }
    fieldsData = value;
    binaryLength = value.length;
    binaryOffset = 0;
  }

  /** Expert: change the value of this field.  See {@link #setValue(String)}.
   *
   *  @throws IllegalArgumentException if this is not a binary field
   */
  public void setValue(byte[] value, int offset, int length) {
    if (!isBinary) {
      throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field");
    }
    fieldsData = value;
    binaryLength = length;
    binaryOffset = offset;
  }

  /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
   *  May be combined with stored values from stringValue() or getBinaryValue() */
  public void setTokenStream(TokenStream tokenStream) {
    this.isIndexed = true;
    this.isTokenized = true;
    this.tokenStream = tokenStream;
  }

  /**
   * Create a field by specifying its name, value and how it will
   * be saved in the index. Term vectors will not be stored in the index.
   *
   * @param name The name of the field
   * @param value The string to process
   * @param store Whether <code>value</code> should be stored in the index
   * @param index Whether the field should be indexed, and if so, if it should
   *  be tokenized before indexing
   * @throws NullPointerException if name or value is <code>null</code>
   * @throws IllegalArgumentException if the field is neither stored nor indexed
   */
  public Field(String name, String value, Store store, Index index) {
    this(name, value, store, index, TermVector.NO);
  }

  /**
   * Create a field by specifying its name, value and how it will
   * be saved in the index.
   *
   * @param name The name of the field
   * @param value The string to process
   * @param store Whether <code>value</code> should be stored in the index
   * @param index Whether the field should be indexed, and if so, if it should
   *  be tokenized before indexing
   * @param termVector Whether term vector should be stored
   * @throws NullPointerException if name or value is <code>null</code>
   * @throws IllegalArgumentException in any of the following situations:
   * <ul>
   *  <li>the field is neither stored nor indexed</li>
   *  <li>the field is not indexed but termVector is not <code>TermVector.NO</code></li>
   * </ul>
   */
  public Field(String name, String value, Store store, Index index, TermVector termVector) {
    this(name, true, value, store, index, termVector);
  }

  /**
   * Create a field by specifying its name, value and how it will
   * be saved in the index.
   *
   * @param name The name of the field
   * @param internName Whether to .intern() name or not
   * @param value The string to process
   * @param store Whether <code>value</code> should be stored in the index
   * @param index Whether the field should be indexed, and if so, if it should
   *  be tokenized before indexing
   * @param termVector Whether term vector should be stored
   * @throws NullPointerException if name or value is <code>null</code>
   * @throws IllegalArgumentException in any of the following situations:
   * <ul>
   *  <li>the field is neither stored nor indexed</li>
   *  <li>the field is not indexed but termVector is not <code>TermVector.NO</code></li>
   * </ul>
   */
  public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) {
    if (name == null) {
      throw new NullPointerException("name cannot be null");
    }
    if (value == null) {
      throw new NullPointerException("value cannot be null");
    }
    if (index == Index.NO && store == Store.NO) {
      throw new IllegalArgumentException("it doesn't make sense to have a field that "
         + "is neither indexed nor stored");
    }
    if (index == Index.NO && termVector != TermVector.NO) {
      throw new IllegalArgumentException("cannot store term vector information "
         + "for a field that is not indexed");
    }

    if (internName) { // field names are optionally interned
      name = StringHelper.intern(name);
    }

    this.name = name;

    this.fieldsData = value;

    this.isStored = store.isStored();

    this.isIndexed = index.isIndexed();
    this.isTokenized = index.isAnalyzed();
    this.omitNorms = index.omitNorms();
    if (index == Index.NO) {
      // note: this reads oddly, but it is the existing behaviour: only
      // non-indexed fields get an explicit value here; indexed fields keep
      // whatever indexOptions value they already had.
      this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    }

    this.isBinary = false;

    setStoreTermVector(termVector);
  }

  /**
   * Create a tokenized and indexed field that is not stored. Term vectors will
   * not be stored.  The Reader is read only when the Document is added to the index,
   * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param reader The reader with the content
   * @throws NullPointerException if name or reader is <code>null</code>
   */
  public Field(String name, Reader reader) {
    this(name, reader, TermVector.NO);
  }

  /**
   * Create a tokenized and indexed field that is not stored, optionally with
   * storing term vectors.  The Reader is read only when the Document is added to the index,
   * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param reader The reader with the content
   * @param termVector Whether term vector should be stored
   * @throws NullPointerException if name or reader is <code>null</code>
   */
  public Field(String name, Reader reader, TermVector termVector) {
    if (name == null) {
      throw new NullPointerException("name cannot be null");
    }
    if (reader == null) {
      throw new NullPointerException("reader cannot be null");
    }

    this.name = StringHelper.intern(name);        // field names are interned
    this.fieldsData = reader;

    this.isStored = false;

    this.isIndexed = true;
    this.isTokenized = true;

    this.isBinary = false;

    setStoreTermVector(termVector);
  }

  /**
   * Create a tokenized and indexed field that is not stored. Term vectors will
   * not be stored. This is useful for pre-analyzed fields.
   * The TokenStream is read only when the Document is added to the index,
   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param tokenStream The TokenStream with the content
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream) {
    this(name, tokenStream, TermVector.NO);
  }

  /**
   * Create a tokenized and indexed field that is not stored, optionally with
   * storing term vectors.  This is useful for pre-analyzed fields.
   * The TokenStream is read only when the Document is added to the index,
   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param tokenStream The TokenStream with the content
   * @param termVector Whether term vector should be stored
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream, TermVector termVector) {
    if (name == null) {
      throw new NullPointerException("name cannot be null");
    }
    if (tokenStream == null) {
      throw new NullPointerException("tokenStream cannot be null");
    }

    this.name = StringHelper.intern(name);        // field names are interned
    this.fieldsData = null;
    this.tokenStream = tokenStream;

    this.isStored = false;

    this.isIndexed = true;
    this.isTokenized = true;

    this.isBinary = false;

    setStoreTermVector(termVector);
  }


  /**
   * Create a stored field with binary value.
   *
   * @param name The name of the field
   * @param value The binary value
   * @param store Must be Store.YES
   * @throws IllegalArgumentException if store is <code>Store.NO</code>, or if
   *  name or value is <code>null</code>
   * @deprecated Use {@link #Field(String, byte[])} instead
   */
  @Deprecated
  public Field(String name, byte[] value, Store store) {
    this(name, value, 0, lengthOf(value));

    if (store == Store.NO) {
      throw new IllegalArgumentException("binary values can't be unstored");
    }
  }

  /**
   * Create a stored field with binary value.
   *
   * @param name The name of the field
   * @param value The binary value
   * @throws IllegalArgumentException if name or value is <code>null</code>
   */
  public Field(String name, byte[] value) {
    this(name, value, 0, lengthOf(value));
  }

  /**
   * Create a stored field with binary value.
   *
   * @param name The name of the field
   * @param value The binary value
   * @param offset Starting offset in value where this Field's bytes are
   * @param length Number of bytes to use for this Field, starting at offset
   * @param store Must be Store.YES
   * @throws IllegalArgumentException if store is <code>Store.NO</code>, or if
   *  name or value is <code>null</code>
   * @deprecated Use {@link #Field(String, byte[], int, int)} instead
   */
  @Deprecated
  public Field(String name, byte[] value, int offset, int length, Store store) {
    this(name, value, offset, length);

    if (store == Store.NO) {
      throw new IllegalArgumentException("binary values can't be unstored");
    }
  }

  /**
   * Create a stored field with binary value.
   *
   * @param name The name of the field
   * @param value The binary value
   * @param offset Starting offset in value where this Field's bytes are
   * @param length Number of bytes to use for this Field, starting at offset
   * @throws IllegalArgumentException if name or value is <code>null</code>
   */
  public Field(String name, byte[] value, int offset, int length) {

    if (name == null) {
      throw new IllegalArgumentException("name cannot be null");
    }
    if (value == null) {
      throw new IllegalArgumentException("value cannot be null");
    }

    this.name = StringHelper.intern(name);        // field names are interned
    fieldsData = value;

    isStored = true;
    isIndexed = false;
    isTokenized = false;
    indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    omitNorms = true;

    isBinary = true;
    binaryLength = length;
    binaryOffset = offset;

    setStoreTermVector(TermVector.NO);
  }

  /**
   * Returns the length of a binary value for use as a <code>this(...)</code>
   * argument, rejecting <code>null</code> with the same exception type and
   * message as {@link #Field(String, byte[], int, int)} instead of a bare
   * NullPointerException from <code>value.length</code>.
   */
  private static int lengthOf(byte[] value) {
    if (value == null) {
      throw new IllegalArgumentException("value cannot be null");
    }
    return value.length;
  }
}