package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;

/**
 * Class responsible for access to stored document fields.
 * <p/>
 * It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
 */
final class FieldsReader implements Cloneable, Closeable {
  private final FieldInfos fieldInfos;

  // The main fieldStream, used only for cloning.
  private final IndexInput cloneableFieldsStream;

  // This is a clone of cloneableFieldsStream used for reading documents.
  // It should not be cloned outside of a synchronized context.
  private final IndexInput fieldsStream;

  private final IndexInput cloneableIndexStream;
  private final IndexInput indexStream;
  private int numTotalDocs;
  private int size;
  private boolean closed;
  private final int format;
  private final int formatSize;

  // The docID offset where our docs begin in the index
  // file.  This will be 0 if we have our own private file.
  private int docStoreOffset;

  private CloseableThreadLocal<IndexInput> fieldsStreamTL = new CloseableThreadLocal<IndexInput>();
  private boolean isOriginal = false;

  /** Returns a cloned FieldsReader that shares open
   *  IndexInputs with the original one.  It is the caller's
   *  job not to close the original FieldsReader until all
   *  clones are closed (eg, currently SegmentReader manages
   *  this logic). */
  @Override
  public Object clone() {
    ensureOpen();
    return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream);
  }

  /**
   * Detects the code version this segment was written with. Returns either
   * "2.x" for all pre-3.0 segments, or "3.0" for 3.0 segments. This method
   * should not be called for 3.1+ segments since they already record their code
   * version.
   */
  static String detectCodeVersion(Directory dir, String segment) throws IOException {
    IndexInput idxStream = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION), 1024);
    try {
      int format = idxStream.readInt();
      if (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) {
        return "2.x";
      } else {
        return "3.0";
      }
    } finally {
      idxStream.close();
    }
  }

  // Used only by clone
  private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize,
                       int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) {
    this.fieldInfos = fieldInfos;
    this.numTotalDocs = numTotalDocs;
    this.size = size;
    this.format = format;
    this.formatSize = formatSize;
    this.docStoreOffset = docStoreOffset;
    this.cloneableFieldsStream = cloneableFieldsStream;
    this.cloneableIndexStream = cloneableIndexStream;
    fieldsStream = (IndexInput) cloneableFieldsStream.clone();
    indexStream = (IndexInput) cloneableIndexStream.clone();
  }

  FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
    this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
  }

  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
    this(d, segment, fn, readBufferSize, -1, 0);
  }
  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
    boolean success = false;
    isOriginal = true;
    try {
      fieldInfos = fn;

      cloneableFieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_EXTENSION), readBufferSize);
      cloneableIndexStream = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION), readBufferSize);

      // First version of fdx did not include a format
      // header, but, the first int will always be 0 in that
      // case.
      int firstInt = cloneableIndexStream.readInt();
      if (firstInt == 0)
        format = 0;
      else
        format = firstInt;

      if (format > FieldsWriter.FORMAT_CURRENT)
        throw new CorruptIndexException("Incompatible format version: " + format + " expected "
            + FieldsWriter.FORMAT_CURRENT + " or lower");

      // Later formats store the format as a leading int, which
      // seekIndex must skip over.
      if (format > FieldsWriter.FORMAT)
        formatSize = 4;
      else
        formatSize = 0;

      if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
        cloneableFieldsStream.setModifiedUTF8StringsMode();

      fieldsStream = (IndexInput) cloneableFieldsStream.clone();

      final long indexSize = cloneableIndexStream.length() - formatSize;

      if (docStoreOffset != -1) {
        // We read only a slice out of this shared fields file
        this.docStoreOffset = docStoreOffset;
        this.size = size;

        // Verify the file is long enough to hold all of our
        // docs
        assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset;
      } else {
        this.docStoreOffset = 0;
        this.size = (int) (indexSize >> 3);
      }

      indexStream = (IndexInput) cloneableIndexStream.clone();
      numTotalDocs = (int) (indexSize >> 3);
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  /**
   * @throws AlreadyClosedException if this FieldsReader is closed
   */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  /**
   * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a
   * lazy implementation of a Field.  This means that the Fields values will not be accessible.
   *
   * @throws IOException
   */
  public final void close() throws IOException {
    if (!closed) {
      if (isOriginal) {
        IOUtils.close(fieldsStream, indexStream, fieldsStreamTL, cloneableFieldsStream, cloneableIndexStream);
      } else {
        // Clones must not close the shared cloneable streams; the
        // original FieldsReader owns them.
        IOUtils.close(fieldsStream, indexStream, fieldsStreamTL);
      }
      closed = true;
    }
  }

  final int size() {
    return size;
  }
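
  // Each .fdx entry is a single long (8 bytes): the file pointer into
  // .fdt where that document's stored fields begin. Seeking for a docID
  // therefore skips the format header (formatSize) plus 8 bytes per
  // preceding document, shifted by docStoreOffset for shared doc stores.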
  private final void seekIndex(int docID) throws IOException {
    indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
  }

  boolean canReadRawDocs() {
    // Disable reading raw docs in 2.x format, because of the removal of compressed
    // fields in 3.0. We don't want rawDocs() to decode field bits to figure out
    // if a field was compressed, hence we enforce ordinary (non-raw) stored field merges
    // for <3.0 indexes.
    return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
  }
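
  /**
   * Loads the stored fields of document <code>n</code> into a new
   * {@link Document}, consulting the (optional) {@link FieldSelector} per
   * field to decide whether to load it eagerly, lazily, as a size-only
   * placeholder, or to skip it entirely.
   */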
  final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    seekIndex(n);
    long position = indexStream.readLong();
    fieldsStream.seek(position);

    Document doc = new Document();
    int numFields = fieldsStream.readVInt();
    out: for (int i = 0; i < numFields; i++) {
      int fieldNumber = fieldsStream.readVInt();
      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
      FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);

      int bits = fieldsStream.readByte() & 0xFF;
      assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_COMPRESSED | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits);

      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
      assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
        : "compressed fields are only allowed in indexes of version <= 2.9";
      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
      boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
      final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK;

      switch (acceptField) {
        case LOAD:
          addField(doc, fi, binary, compressed, tokenize, numeric);
          break;
        case LOAD_AND_BREAK:
          addField(doc, fi, binary, compressed, tokenize, numeric);
          break out; //Get out of this loop
        case LAZY_LOAD:
          addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric);
          break;
        case LATENT:
          addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric);
          break;
        case SIZE:
          skipFieldBytes(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric));
          break;
        case SIZE_AND_BREAK:
          addFieldSize(doc, fi, binary, compressed, numeric);
          break out; //Get out of this loop
        default:
          skipField(binary, compressed, numeric);
      }
    }

    return doc;
  }

  /** Returns the length in bytes of each raw document in a
   *  contiguous range of length numDocs starting with
   *  startDocID.  Returns the IndexInput (the fieldStream),
   *  already seeked to the starting point for startDocID.*/
  final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
    seekIndex(startDocID);
    long startOffset = indexStream.readLong();
    long lastOffset = startOffset;
    int count = 0;
    while (count < numDocs) {
      final long offset;
      final int docID = docStoreOffset + startDocID + count + 1;
      assert docID <= numTotalDocs;
      if (docID < numTotalDocs)
        offset = indexStream.readLong();
      else
        // The last doc's length is bounded by the end of the fields file.
        offset = fieldsStream.length();
      lengths[count++] = (int) (offset - lastOffset);
      lastOffset = offset;
    }

    fieldsStream.seek(startOffset);

    return fieldsStream;
  }

  /**
   * Skip the field.  We still have to read some of the information about the field, but can skip past the actual content.
   * This will have the most payoff on large fields.
   */
  private void skipField(boolean binary, boolean compressed, int numeric) throws IOException {
    final int numBytes;
    switch (numeric) {
      case 0:
        numBytes = fieldsStream.readVInt();
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_INT:
      case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
        numBytes = 4;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_LONG:
      case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
        numBytes = 8;
        break;
      default:
        throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
    }

    skipFieldBytes(binary, compressed, numBytes);
  }

  private void skipFieldBytes(boolean binary, boolean compressed, int toRead) throws IOException {
    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
      fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
    } else {
      // We need to skip chars. This will slow us down, but still better
      // than actually decoding the field value.
      fieldsStream.skipChars(toRead);
    }
  }
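
  // Numeric fields are stored as fixed-width binary (4 or 8 bytes), so
  // they are decoded directly from the stream rather than as a
  // length-prefixed byte[] or string like other stored fields.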
  private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException {
    assert numeric != 0;
    switch (numeric) {
      case FieldsWriter.FIELD_IS_NUMERIC_INT:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readInt());
      case FieldsWriter.FIELD_IS_NUMERIC_LONG:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readLong());
      case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readInt()));
      case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readLong()));
      default:
        throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
    }
  }
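
  /**
   * Adds a {@link LazyField} placeholder to the document: only the length
   * and file pointer of the value are read now; the value itself is loaded
   * on first access (and optionally cached, per <code>cacheResult</code>).
   */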
  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult, int numeric) throws IOException {
    final AbstractField f;
    if (binary) {
      int toRead = fieldsStream.readVInt();
      long pointer = fieldsStream.getFilePointer();
      f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult);
      //Need to move the pointer ahead by toRead positions
      fieldsStream.seek(pointer + toRead);
    } else if (numeric != 0) {
      f = loadNumericField(fi, numeric);
    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
      Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);

      if (compressed) {
        int toRead = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        f = new LazyField(fi.name, store, toRead, pointer, binary, compressed, cacheResult);
        //skip over the part that we aren't loading
        fieldsStream.seek(pointer + toRead);
      } else {
        int length = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        //Skip ahead of where we are by the length of what is stored
        if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
          fieldsStream.seek(pointer + length);
        } else {
          fieldsStream.skipChars(length);
        }
        f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed, cacheResult);
      }
    }

    f.setOmitNorms(fi.omitNorms);
    f.setIndexOptions(fi.indexOptions);
    doc.add(f);
  }
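
  /**
   * Eagerly reads a stored field's value from the fields stream and adds it
   * to the document, decompressing 2.x-era compressed values on the fly.
   */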
  private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, int numeric) throws CorruptIndexException, IOException {
    final AbstractField f;

    if (binary) {
      //we have a binary stored field, and it may be compressed
      int toRead = fieldsStream.readVInt();
      final byte[] b = new byte[toRead];
      fieldsStream.readBytes(b, 0, b.length);
      if (compressed) {
        f = new Field(fi.name, uncompress(b));
      } else {
        f = new Field(fi.name, b);
      }
    } else if (numeric != 0) {
      f = loadNumericField(fi, numeric);
    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
      Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
      if (compressed) {
        int toRead = fieldsStream.readVInt();
        final byte[] b = new byte[toRead];
        fieldsStream.readBytes(b, 0, b.length);
        f = new Field(fi.name,      // field name
            false,                  // name is already interned
            new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
            store,
            index,
            termVector);
      } else {
        f = new Field(fi.name,      // name
            false,                  // name is already interned
            fieldsStream.readString(), // read value
            store,
            index,
            termVector);
      }
    }

    f.setIndexOptions(fi.indexOptions);
    f.setOmitNorms(fi.omitNorms);
    doc.add(f);
  }

  // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
  // Read just the size -- caller must skip the field content to continue reading fields
  // Return the size in bytes or chars, depending on field type
  private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, int numeric) throws IOException {
    final int bytesize, size;
    switch (numeric) {
      case 0:
        size = fieldsStream.readVInt();
        bytesize = (binary || compressed) ? size : 2 * size;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_INT:
      case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
        size = bytesize = 4;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_LONG:
      case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
        size = bytesize = 8;
        break;
      default:
        throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
    }
    byte[] sizebytes = new byte[4];
    sizebytes[0] = (byte) (bytesize >>> 24);
    sizebytes[1] = (byte) (bytesize >>> 16);
    sizebytes[2] = (byte) (bytesize >>>  8);
    sizebytes[3] = (byte)  bytesize;
    doc.add(new Field(fi.name, sizebytes));
    return size;
  }

  /**
   * A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
   * loaded.
   */
  private class LazyField extends AbstractField implements Fieldable {
    private int toRead;
    private long pointer;
    /** @deprecated Only kept for backward-compatibility with <3.0 indexes. Will be removed in 4.0. */
    @Deprecated
    private boolean isCompressed;
    private boolean cacheResult;

    public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
      super(name, store, Field.Index.NO, Field.TermVector.NO);
      this.toRead = toRead;
      this.pointer = pointer;
      this.isBinary = isBinary;
      this.cacheResult = cacheResult;
      if (isBinary)
        binaryLength = toRead;
      lazy = true;
      this.isCompressed = isCompressed;
    }

    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
      super(name, store, index, termVector);
      this.toRead = toRead;
      this.pointer = pointer;
      this.isBinary = isBinary;
      this.cacheResult = cacheResult;
      if (isBinary)
        binaryLength = toRead;
      lazy = true;
      this.isCompressed = isCompressed;
    }
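
    // Each thread gets its own clone of the fields stream, so lazy
    // loading never races with other threads positioned elsewhere in
    // the file.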
    private IndexInput getFieldStream() {
      IndexInput localFieldsStream = fieldsStreamTL.get();
      if (localFieldsStream == null) {
        localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
        fieldsStreamTL.set(localFieldsStream);
      }
      return localFieldsStream;
    }

    /** The value of the field as a Reader, or null. If null, the String value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
    public Reader readerValue() {
      ensureOpen();
      return null;
    }

    /** The value of the field as a TokenStream, or null. If null, the Reader value,
     * String value, or binary value is used. Exactly one of stringValue(),
     * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
    public TokenStream tokenStreamValue() {
      ensureOpen();
      return null;
    }

    /** The value of the field as a String, or null. If null, the Reader value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
    public String stringValue() {
      ensureOpen();
      if (isBinary)
        return null;
      else {
        if (fieldsData == null) {
          String value = null;
          IndexInput localFieldsStream = getFieldStream();
          try {
            localFieldsStream.seek(pointer);
            if (isCompressed) {
              final byte[] b = new byte[toRead];
              localFieldsStream.readBytes(b, 0, b.length);
              value = new String(uncompress(b), "UTF-8");
            } else {
              if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
                byte[] bytes = new byte[toRead];
                localFieldsStream.readBytes(bytes, 0, toRead);
                value = new String(bytes, "UTF-8");
              } else {
                //read in chars b/c we already know the length we need to read
                char[] chars = new char[toRead];
                localFieldsStream.readChars(chars, 0, toRead);
                value = new String(chars);
              }
            }
          } catch (IOException e) {
            throw new FieldReaderException(e);
          }
          if (cacheResult == true) {
            fieldsData = value;
          }
          return value;
        } else {
          return (String) fieldsData;
        }
      }
    }

    public long getPointer() {
      ensureOpen();
      return pointer;
    }

    public void setPointer(long pointer) {
      ensureOpen();
      this.pointer = pointer;
    }

    public int getToRead() {
      ensureOpen();
      return toRead;
    }

    public void setToRead(int toRead) {
      ensureOpen();
      this.toRead = toRead;
    }
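
    /** The value of the field as a byte[], or null for non-binary fields.
     *  The value is read (and uncompressed if necessary) on first access;
     *  <code>result</code> is reused as the read buffer when it is large
     *  enough to hold the value. */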
    public byte[] getBinaryValue(byte[] result) {
      ensureOpen();

      if (isBinary) {
        if (fieldsData == null) {
          // Allocate new buffer if result is null or too small
          final byte[] b;
          if (result == null || result.length < toRead)
            b = new byte[toRead];
          else
            b = result;

          IndexInput localFieldsStream = getFieldStream();

          // Throw this IOException since IndexReader.document does so anyway, so probably not that big of a change for people
          // since they are already handling this exception when getting the document
          byte[] value = null;
          try {
            localFieldsStream.seek(pointer);
            localFieldsStream.readBytes(b, 0, toRead);
            if (isCompressed == true) {
              value = uncompress(b);
            } else {
              value = b;
            }
          } catch (IOException e) {
            throw new FieldReaderException(e);
          }

          binaryOffset = 0;
          binaryLength = toRead;
          if (cacheResult == true) {
            fieldsData = value;
          }
          return value;
        } else {
          return (byte[]) fieldsData;
        }
      } else {
        return null;
      }
    }
  }
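
  // 2.x indexes may contain deflate-compressed field values; 3.0 removed
  // the ability to write them, so decompression support is kept only for
  // reading old segments.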
  private byte[] uncompress(byte[] b)
        throws CorruptIndexException {
    try {
      return CompressionTools.decompress(b);
    } catch (DataFormatException e) {
      // this will happen if the field is not compressed
      CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
      newException.initCause(e);
      throw newException;
    }
  }
}