1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.document.Document;
21 import org.apache.lucene.document.FieldSelector;
22 import org.apache.lucene.search.FieldCache; // javadocs
23 import org.apache.lucene.search.Similarity;
24 import org.apache.lucene.store.*;
25 import org.apache.lucene.util.ArrayUtil;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.io.Closeable;
31 import java.util.Collection;
33 import java.util.concurrent.atomic.AtomicInteger;
35 /** IndexReader is an abstract class, providing an interface for accessing an
36 index. Search of an index is done entirely through this abstract interface,
37 so that any subclass which implements it is searchable.
39 <p> Concrete subclasses of IndexReader are usually constructed with a call to
40 one of the static <code>open()</code> methods, e.g. {@link
41 #open(Directory, boolean)}.
43 <p> For efficiency, in this API documents are often referred to via
44 <i>document numbers</i>, non-negative integers which each name a unique
45 document in the index. These document numbers are ephemeral--they may change
46 as documents are added to and deleted from an index. Clients should thus not
47 rely on a given document having the same number between sessions.
49 <p> An IndexReader can be opened on a directory for which an IndexWriter is
50 opened already, but it cannot be used to delete documents from the index then.
53 <b>NOTE</b>: for backwards API compatibility, several methods are not listed
54 as abstract, but have no useful implementations in this base class and
55 instead always throw UnsupportedOperationException. Subclasses are
56 strongly encouraged to override these methods, but in many cases may not
62 <b>NOTE</b>: as of 2.4, it's possible to open a read-only
63 IndexReader using the static open methods that accept the
64 boolean readOnly parameter. Such a reader has better
65 concurrency as it's not necessary to synchronize on the
66 isDeleted method. You must specify false if you want to
67 make changes with the resulting IndexReader.
70 <a name="thread-safety"></a><p><b>NOTE</b>: {@link
71 IndexReader} instances are completely thread
72 safe, meaning multiple threads can call any of its methods,
73 concurrently. If your application requires external
74 synchronization, you should <b>not</b> synchronize on the
75 <code>IndexReader</code> instance; use your own
76 (non-Lucene) objects instead.
78 public abstract class IndexReader implements Cloneable,Closeable {
81 * A custom listener that's invoked when the IndexReader
84 * <p>For a SegmentReader, this listener is called only
85 * once all SegmentReaders sharing the same core are
86 * closed. At this point it is safe for apps to evict
87 * this reader from any caches keyed on {@link
88 * #getCoreCacheKey}. This is the same interface that
89 * {@link FieldCache} uses, internally, to evict
92 * <p>For other readers, this listener is called when they
95 * @lucene.experimental
97 public static interface ReaderFinishedListener {
98 public void finished(IndexReader reader);
101 // Impls must set this if they may call add/removeReaderFinishedListener:
102 protected volatile Collection<ReaderFinishedListener> readerFinishedListeners;
104 /** Expert: adds a {@link ReaderFinishedListener}. The
105 * provided listener is also added to any sub-readers, if
106 * this is a composite reader. Also, any reader reopened
107 * or cloned from this one will also copy the listeners at
108 * the time of reopen.
110 * @lucene.experimental */
111 public void addReaderFinishedListener(ReaderFinishedListener listener) {
112 readerFinishedListeners.add(listener);
115 /** Expert: remove a previously added {@link ReaderFinishedListener}.
117 * @lucene.experimental */
118 public void removeReaderFinishedListener(ReaderFinishedListener listener) {
119 readerFinishedListeners.remove(listener);
122 protected void notifyReaderFinishedListeners() {
123 // Defensive (should never be null -- all impls must set
125 if (readerFinishedListeners != null) {
126 for(ReaderFinishedListener listener : readerFinishedListeners) {
127 listener.finished(this);
132 protected void readerFinished() {
133 notifyReaderFinishedListeners();
137 * Constants describing field properties, for example used for
138 * {@link IndexReader#getFieldNames(FieldOption)}.
140 public static enum FieldOption {
143 /** All indexed fields */
145 /** All fields that store payloads */
147 /** All fields that omit tf */
148 OMIT_TERM_FREQ_AND_POSITIONS,
149 /** All fields that omit positions */
151 /** All fields which are not indexed */
153 /** All fields which are indexed with termvectors enabled */
154 INDEXED_WITH_TERMVECTOR,
155 /** All fields which are indexed but don't have termvectors enabled */
156 INDEXED_NO_TERMVECTOR,
157 /** All fields with termvectors enabled. Please note that only standard termvector fields are returned */
159 /** All fields with termvectors with position values enabled */
160 TERMVECTOR_WITH_POSITION,
161 /** All fields with termvectors with offset values enabled */
162 TERMVECTOR_WITH_OFFSET,
163 /** All fields with termvectors with offset values and position values enabled */
164 TERMVECTOR_WITH_POSITION_OFFSET,
167 private boolean closed;
168 protected boolean hasChanges;
170 private final AtomicInteger refCount = new AtomicInteger();
172 static int DEFAULT_TERMS_INDEX_DIVISOR = 1;
174 /** Expert: returns the current refCount for this reader */
175 public int getRefCount() {
176 return refCount.get();
180 * Expert: increments the refCount of this IndexReader
181 * instance. RefCounts are used to determine when a
182 * reader can be closed safely, i.e. as soon as there are
183 * no more references. Be sure to always call a
184 * corresponding {@link #decRef}, in a finally clause;
185 * otherwise the reader may never be closed. Note that
186 * {@link #close} simply calls decRef(), which means that
187 * the IndexReader will not really be closed until {@link
188 * #decRef} has been called for all outstanding
193 public void incRef() {
195 refCount.incrementAndGet();
200 public String toString() {
201 final StringBuilder buffer = new StringBuilder();
205 buffer.append(getClass().getSimpleName());
207 final IndexReader[] subReaders = getSequentialSubReaders();
208 if ((subReaders != null) && (subReaders.length > 0)) {
209 buffer.append(subReaders[0]);
210 for (int i = 1; i < subReaders.length; ++i) {
211 buffer.append(" ").append(subReaders[i]);
215 return buffer.toString();
219 * Expert: decreases the refCount of this IndexReader
220 * instance. If the refCount drops to 0, then pending
221 * changes (if any) are committed to the index and this
222 * reader is closed. If an exception is hit, the refCount
225 * @throws IOException in case an IOException occurs in commit() or doClose()
229 public void decRef() throws IOException {
231 if (refCount.getAndDecrement() == 1) {
232 boolean success = false;
239 // Put reference back on failure
240 refCount.incrementAndGet();
247 protected IndexReader() {
252 * @throws AlreadyClosedException if this IndexReader is closed
254 protected final void ensureOpen() throws AlreadyClosedException {
255 if (refCount.get() <= 0) {
256 throw new AlreadyClosedException("this IndexReader is closed");
260 /** Returns an IndexReader reading the index in the given
261 * Directory, with readOnly=true.
262 * @param directory the index directory
263 * @throws CorruptIndexException if the index is corrupt
264 * @throws IOException if there is a low-level IO error
266 public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException {
267 return open(directory, null, null, true, DEFAULT_TERMS_INDEX_DIVISOR);
270 /** Returns an IndexReader reading the index in the given
271 * Directory. You should pass readOnly=true, since it
272 * gives much better concurrent performance, unless you
273 * intend to do write operations (delete documents or
274 * change norms) with the reader.
275 * @param directory the index directory
276 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
277 * @throws CorruptIndexException if the index is corrupt
278 * @throws IOException if there is a low-level IO error
280 public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException {
281 return open(directory, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR);
285 * Open a near real time IndexReader from the {@link org.apache.lucene.index.IndexWriter}.
287 * @param writer The IndexWriter to open from
288 * @param applyAllDeletes If true, all buffered deletes will
289 * be applied (made visible) in the returned reader. If
290 * false, the deletes are not applied but remain buffered
291 * (in IndexWriter) so that they will be applied in the
292 * future. Applying deletes can be costly, so if your app
293 * can tolerate deleted documents being returned you might
294 * gain some performance by passing false.
295 * @return The new IndexReader
296 * @throws CorruptIndexException
297 * @throws IOException if there is a low-level IO error
299 * @see #reopen(IndexWriter,boolean)
301 * @lucene.experimental
303 public static IndexReader open(final IndexWriter writer, boolean applyAllDeletes) throws CorruptIndexException, IOException {
304 return writer.getReader(applyAllDeletes);
307 /** Expert: returns an IndexReader reading the index in the given
308 * {@link IndexCommit}. You should pass readOnly=true, since it
309 * gives much better concurrent performance, unless you
310 * intend to do write operations (delete documents or
311 * change norms) with the reader.
312 * @param commit the commit point to open
313 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
314 * @throws CorruptIndexException if the index is corrupt
315 * @throws IOException if there is a low-level IO error
317 public static IndexReader open(final IndexCommit commit, boolean readOnly) throws CorruptIndexException, IOException {
318 return open(commit.getDirectory(), null, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR);
321 /** Expert: returns an IndexReader reading the index in
322 * the given Directory, with a custom {@link
323 * IndexDeletionPolicy}. You should pass readOnly=true,
324 * since it gives much better concurrent performance,
325 * unless you intend to do write operations (delete
326 * documents or change norms) with the reader.
327 * @param directory the index directory
328 * @param deletionPolicy a custom deletion policy (only used
329 * if you use this reader to perform deletes or to set
330 * norms); see {@link IndexWriter} for details.
331 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
332 * @throws CorruptIndexException if the index is corrupt
333 * @throws IOException if there is a low-level IO error
335 public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException {
336 return open(directory, deletionPolicy, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR);
339 /** Expert: returns an IndexReader reading the index in
340 * the given Directory, with a custom {@link
341 * IndexDeletionPolicy}. You should pass readOnly=true,
342 * since it gives much better concurrent performance,
343 * unless you intend to do write operations (delete
344 * documents or change norms) with the reader.
345 * @param directory the index directory
346 * @param deletionPolicy a custom deletion policy (only used
347 * if you use this reader to perform deletes or to set
348 * norms); see {@link IndexWriter} for details.
349 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
350 * @param termInfosIndexDivisor Subsamples which indexed
351 * terms are loaded into RAM. This has the same effect as {@link
352 * IndexWriter#setTermIndexInterval} except that setting
353 * must be done at indexing time while this setting can be
354 * set per reader. When set to N, then one in every
355 * N*termIndexInterval terms in the index is loaded into
356 * memory. By setting this to a value > 1 you can reduce
357 * memory usage, at the expense of higher latency when
358 * loading a TermInfo. The default value is 1. Set this
359 * to -1 to skip loading the terms index entirely.
360 * @throws CorruptIndexException if the index is corrupt
361 * @throws IOException if there is a low-level IO error
363 public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException {
364 return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor);
367 /** Expert: returns an IndexReader reading the index in
368 * the given Directory, using a specific commit and with
369 * a custom {@link IndexDeletionPolicy}. You should pass
370 * readOnly=true, since it gives much better concurrent
371 * performance, unless you intend to do write operations
372 * (delete documents or change norms) with the reader.
373 * @param commit the specific {@link IndexCommit} to open;
374 * see {@link IndexReader#listCommits} to list all commits
376 * @param deletionPolicy a custom deletion policy (only used
377 * if you use this reader to perform deletes or to set
378 * norms); see {@link IndexWriter} for details.
379 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
380 * @throws CorruptIndexException if the index is corrupt
381 * @throws IOException if there is a low-level IO error
383 public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException {
384 return open(commit.getDirectory(), deletionPolicy, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR);
387 /** Expert: returns an IndexReader reading the index in
388 * the given Directory, using a specific commit and with
389 * a custom {@link IndexDeletionPolicy}. You should pass
390 * readOnly=true, since it gives much better concurrent
391 * performance, unless you intend to do write operations
392 * (delete documents or change norms) with the reader.
393 * @param commit the specific {@link IndexCommit} to open;
394 * see {@link IndexReader#listCommits} to list all commits
396 * @param deletionPolicy a custom deletion policy (only used
397 * if you use this reader to perform deletes or to set
398 * norms); see {@link IndexWriter} for details.
399 * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
400 * @param termInfosIndexDivisor Subsamples which indexed
401 * terms are loaded into RAM. This has the same effect as {@link
402 * IndexWriter#setTermIndexInterval} except that setting
403 * must be done at indexing time while this setting can be
404 * set per reader. When set to N, then one in every
405 * N*termIndexInterval terms in the index is loaded into
406 * memory. By setting this to a value > 1 you can reduce
407 * memory usage, at the expense of higher latency when
408 * loading a TermInfo. The default value is 1. Set this
409 * to -1 to skip loading the terms index entirely. This is only useful in
410 * advanced situations when you will only .next() through all terms;
411 * attempts to seek will hit an exception.
413 * @throws CorruptIndexException if the index is corrupt
414 * @throws IOException if there is a low-level IO error
416 public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException {
417 return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor);
420 private static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException {
421 return DirectoryReader.open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor);
425 * Refreshes an IndexReader if the index has changed since this instance
428 * Opening an IndexReader is an expensive operation. This method can be used
429 * to refresh an existing IndexReader to reduce these costs. This method
430 * tries to only load segments that have changed or were created after the
431 * IndexReader was (re)opened.
433 * If the index has not changed since this instance was (re)opened, then this
434 * call is a NOOP and returns this instance. Otherwise, a new instance is
435 * returned. The old instance is <b>not</b> closed and remains usable.<br>
437 * If the reader is reopened, even though they share
438 * resources internally, it's safe to make changes
439 * (deletions, norms) with the new reader. All shared
440 * mutable state obeys "copy on write" semantics to ensure
441 * the changes are not seen by other readers.
443 * You can determine whether a reader was actually reopened by comparing the
444 * old instance with the instance returned by this method:
446 * IndexReader reader = ...
448 * IndexReader newReader = reader.reopen();
449 * if (newReader != reader) {
450 * ... // reader was reopened
453 * reader = newReader;
457 * Be sure to synchronize that code so that other threads,
458 * if present, can never use reader after it has been
459 * closed and before it's switched to newReader.
461 * <p><b>NOTE</b>: If this reader is a near real-time
462 * reader (obtained from {@link IndexWriter#getReader()}),
463 * reopen() will simply call writer.getReader() again for
464 * you, though this may change in the future.
466 * @throws CorruptIndexException if the index is corrupt
467 * @throws IOException if there is a low-level IO error
469 public synchronized IndexReader reopen() throws CorruptIndexException, IOException {
470 throw new UnsupportedOperationException("This reader does not support reopen().");
474 /** Just like {@link #reopen()}, except you can change the
475 * readOnly of the original reader. If the index is
476 * unchanged but readOnly is different then a new reader
477 * will be returned. */
478 public synchronized IndexReader reopen(boolean openReadOnly) throws CorruptIndexException, IOException {
479 throw new UnsupportedOperationException("This reader does not support reopen().");
482 /** Expert: reopen this reader on a specific commit point.
483 * This always returns a readOnly reader. If the
484 * specified commit point matches what this reader is
485 * already on, and this reader is already readOnly, then
486 * this same instance is returned; if it is not already
487 * readOnly, a readOnly clone is returned. */
488 public synchronized IndexReader reopen(final IndexCommit commit) throws CorruptIndexException, IOException {
489 throw new UnsupportedOperationException("This reader does not support reopen(IndexCommit).");
493 * Expert: returns a readonly reader, covering all
494 * committed as well as un-committed changes to the index.
495 * This provides "near real-time" searching, in that
496 * changes made during an IndexWriter session can be
497 * quickly made available for searching without closing
498 * the writer nor calling {@link #commit}.
500 * <p>Note that this is functionally equivalent to calling
501 * <code>flush</code> (an internal IndexWriter operation) and then using {@link IndexReader#open} to
502 * open a new reader. But the turnaround time of this
503 * method should be faster since it avoids the potentially
504 * costly {@link #commit}.</p>
506 * <p>You must close the {@link IndexReader} returned by
507 * this method once you are done using it.</p>
509 * <p>It's <i>near</i> real-time because there is no hard
510 * guarantee on how quickly you can get a new reader after
511 * making changes with IndexWriter. You'll have to
512 * experiment in your situation to determine if it's
513 * fast enough. As this is a new and experimental
514 * feature, please report back on your findings so we can
515 * learn, improve and iterate.</p>
517 * <p>The resulting reader supports {@link
518 * IndexReader#reopen}, but that call will simply forward
519 * back to this method (though this may change in the
522 * <p>The very first time this method is called, this
523 * writer instance will make every effort to pool the
524 * readers that it opens for doing merges, applying
525 * deletes, etc. This means additional resources (RAM,
526 * file descriptors, CPU time) will be consumed.</p>
528 * <p>For lower latency on reopening a reader, you should
529 * call {@link IndexWriterConfig#setMergedSegmentWarmer} to
530 * pre-warm a newly merged segment before it's committed
531 * to the index. This is important for minimizing
532 * index-to-search delay after a large merge. </p>
534 * <p>If an addIndexes* call is running in another thread,
535 * then this reader will only search those segments from
536 * the foreign index that have been successfully copied
539 * <p><b>NOTE</b>: Once the writer is closed, any
540 * outstanding readers may continue to be used. However,
541 * if you attempt to reopen any of those readers, you'll
542 * hit an {@link AlreadyClosedException}.</p>
544 * @return IndexReader that covers entire index plus all
545 * changes made so far by this IndexWriter instance
547 * @param writer The IndexWriter to open from
548 * @param applyAllDeletes If true, all buffered deletes will
549 * be applied (made visible) in the returned reader. If
550 * false, the deletes are not applied but remain buffered
551 * (in IndexWriter) so that they will be applied in the
552 * future. Applying deletes can be costly, so if your app
553 * can tolerate deleted documents being returned you might
554 * gain some performance by passing false.
556 * @throws IOException
558 * @lucene.experimental
560 public IndexReader reopen(IndexWriter writer, boolean applyAllDeletes) throws CorruptIndexException, IOException {
561 return writer.getReader(applyAllDeletes);
565 * Efficiently clones the IndexReader (sharing most
568 * On cloning a reader with pending changes (deletions,
569 * norms), the original reader transfers its write lock to
570 * the cloned reader. This means only the cloned reader
571 * may make further changes to the index, and commit the
572 * changes to the index on close, but the old reader still
573 * reflects all changes made up until it was cloned.
575 * Like {@link #reopen()}, it's safe to make changes to
576 * either the original or the cloned reader: all shared
577 * mutable state obeys "copy on write" semantics to ensure
578 * the changes are not seen by other readers.
582 public synchronized Object clone() {
583 throw new UnsupportedOperationException("This reader does not implement clone()");
587 * Clones the IndexReader and optionally changes readOnly. A readOnly
588 * reader cannot open a writeable reader.
589 * @throws CorruptIndexException if the index is corrupt
590 * @throws IOException if there is a low-level IO error
592 public synchronized IndexReader clone(boolean openReadOnly) throws CorruptIndexException, IOException {
593 throw new UnsupportedOperationException("This reader does not implement clone()");
597 * Returns the directory associated with this index. The Default
598 * implementation returns the directory specified by subclasses when
599 * delegating to the IndexReader(Directory) constructor, or throws an
600 * UnsupportedOperationException if one was not specified.
601 * @throws UnsupportedOperationException if no directory
603 public Directory directory() {
605 throw new UnsupportedOperationException("This reader does not support this method.");
609 * Returns the time the index in the named directory was last modified.
610 * Do not use this to check whether the reader is still up-to-date, use
611 * {@link #isCurrent()} instead.
612 * @throws CorruptIndexException if the index is corrupt
613 * @throws IOException if there is a low-level IO error
615 public static long lastModified(final Directory directory2) throws CorruptIndexException, IOException {
616 return ((Long) new SegmentInfos.FindSegmentsFile(directory2) {
618 public Object doBody(String segmentFileName) throws IOException {
619 return Long.valueOf(directory2.fileModified(segmentFileName));
621 }.run()).longValue();
625 * Reads version number from segments files. The version number is
626 * initialized with a timestamp and then increased by one for each change of
629 * @param directory where the index resides.
630 * @return version number.
631 * @throws CorruptIndexException if the index is corrupt
632 * @throws IOException if there is a low-level IO error
634 public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException {
635 return SegmentInfos.readCurrentVersion(directory);
639 * Reads commitUserData, previously passed to {@link
640 * IndexWriter#commit(Map)}, from current index
641 * segments file. This will return null if {@link
642 * IndexWriter#commit(Map)} has never been called for
645 * @param directory where the index resides.
646 * @return commit userData.
647 * @throws CorruptIndexException if the index is corrupt
648 * @throws IOException if there is a low-level IO error
650 * @see #getCommitUserData()
652 public static Map<String,String> getCommitUserData(Directory directory) throws CorruptIndexException, IOException {
653 return SegmentInfos.readCurrentUserData(directory);
657 * Version number when this IndexReader was opened. Not
658 * implemented in the IndexReader base class.
660 * <p>If this reader is based on a Directory (ie, was
661 * created by calling {@link #open}, or {@link #reopen} on
662 * a reader based on a Directory), then this method
663 * returns the version recorded in the commit that the
664 * reader opened. This version is advanced every time
665 * {@link IndexWriter#commit} is called.</p>
667 * <p>If instead this reader is a near real-time reader
668 * (ie, obtained by a call to {@link
669 * IndexWriter#getReader}, or by calling {@link #reopen}
670 * on a near real-time reader), then this method returns
671 * the version of the last commit done by the writer.
672 * Note that even as further changes are made with the
673 * writer, the version will not change until a commit is
674 * completed. Thus, you should not rely on this method to
675 * determine when a near real-time reader should be
676 * opened. Use {@link #isCurrent} instead.</p>
678 * @throws UnsupportedOperationException unless overridden in subclass
680 public long getVersion() {
681 throw new UnsupportedOperationException("This reader does not support this method.");
685 * Retrieve the String userData optionally passed to
686 * IndexWriter#commit. This will return null if {@link
687 * IndexWriter#commit(Map)} has never been called for
690 * @see #getCommitUserData(Directory)
692 public Map<String,String> getCommitUserData() {
693 throw new UnsupportedOperationException("This reader does not support this method.");
698 * Check whether any new changes have occurred to the
699 * index since this reader was opened.
701 * <p>If this reader is based on a Directory (ie, was
702 * created by calling {@link #open}, or {@link #reopen} on
703 * a reader based on a Directory), then this method checks
704 * if any further commits (see {@link IndexWriter#commit})
705 * have occurred in that directory.</p>
707 * <p>If instead this reader is a near real-time reader
708 * (ie, obtained by a call to {@link
709 * IndexWriter#getReader}, or by calling {@link #reopen}
710 * on a near real-time reader), then this method checks if
711 * either a new commit has occurred, or any new
712 * uncommitted changes have taken place via the writer.
713 * Note that even if the writer has only performed
714 * merging, this method will still return false.</p>
716 * <p>In any event, if this returns false, you should call
717 * {@link #reopen} to get a new reader that sees the
720 * @throws CorruptIndexException if the index is corrupt
721 * @throws IOException if there is a low-level IO error
722 * @throws UnsupportedOperationException unless overridden in subclass
724 public boolean isCurrent() throws CorruptIndexException, IOException {
725 throw new UnsupportedOperationException("This reader does not support this method.");
729 * Checks if the index is optimized (if it has a single segment and
730 * no deletions). Not implemented in the IndexReader base class.
731 * @return <code>true</code> if the index is optimized; <code>false</code> otherwise
732 * @throws UnsupportedOperationException unless overridden in subclass
734 public boolean isOptimized() {
735 throw new UnsupportedOperationException("This reader does not support this method.");
739 * Return an array of term frequency vectors for the specified document.
740 * The array contains a vector for each vectorized field in the document.
741 * Each vector contains terms and frequencies for all terms in a given vectorized field.
742 * If no such fields existed, the method returns null. The term vectors that are
743 * returned may either be of type {@link TermFreqVector}
744 * or of type {@link TermPositionVector} if
745 * positions or offsets have been stored.
747 * @param docNumber document for which term frequency vectors are returned
748 * @return array of term frequency vectors. May be null if no term vectors have been
749 * stored for the specified document.
750 * @throws IOException if index cannot be accessed
751 * @see org.apache.lucene.document.Field.TermVector
753 abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
758 * Return a term frequency vector for the specified document and field. The
759 * returned vector contains terms and frequencies for the terms in
760 * the specified field of this document, if the field had the storeTermVector
761 * flag set. If termvectors had been stored with positions or offsets, a
762 * {@link TermPositionVector} is returned.
764 * @param docNumber document for which the term frequency vector is returned
765 * @param field field for which the term frequency vector is returned.
766 * @return term frequency vector. May be null if the field does not exist in the specified
767 * document or term vector was not stored.
768 * @throws IOException if index cannot be accessed
769 * @see org.apache.lucene.document.Field.TermVector
771 abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
775 * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
776 * the {@link TermFreqVector}.
777 * @param docNumber The number of the document to load the vector for
778 * @param field The name of the field to load
779 * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
780 * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
783 abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
/**
 * Maps the term vectors of all fields of a document through the given mapper.
 *
 * @param docNumber The number of the document to load the vectors for
 * @param mapper The {@link TermVectorMapper} to process the vectors. Must not be null
 * @throws IOException if term vectors cannot be accessed or if they do not
 *         exist for the document specified
 */
abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
794 * Returns <code>true</code> if an index exists at the specified directory.
795 * @param directory the directory to check for an index
796 * @return <code>true</code> if an index exists; <code>false</code> otherwise
797 * @throws IOException if there is a problem with accessing the index
799 public static boolean indexExists(Directory directory) throws IOException {
801 new SegmentInfos().read(directory);
803 } catch (IOException ioe) {
/**
 * Returns the number of documents in this index.
 *
 * @return the document count; {@link #numDeletedDocs()} is derived from it
 *         as <code>maxDoc() - numDocs()</code>
 */
public abstract int numDocs();
/**
 * Returns one greater than the largest possible document number.
 * This may be used to, e.g., determine how big to allocate an array which
 * will have an element for every document number in an index.
 *
 * @return the exclusive upper bound on document numbers
 */
public abstract int maxDoc();
817 /** Returns the number of deleted documents. */
818 public int numDeletedDocs() {
819 return maxDoc() - numDocs();
/**
 * Returns the stored fields of the <code>n</code><sup>th</sup>
 * <code>Document</code> in this index, by delegating to
 * {@link #document(int, FieldSelector)} with a null selector.
 * <p>
 * <b>NOTE:</b> for performance reasons, this method does not check if the
 * requested document is deleted, and therefore asking for a deleted document
 * may yield unspecified results. Usually this is not required, however you
 * can call {@link #isDeleted(int)} with the requested document ID to verify
 * the document is not deleted.
 *
 * @param n the document number
 * @return the stored fields of document <code>n</code>
 * @throws IllegalArgumentException if <code>n</code> is negative or not
 *         smaller than {@link #maxDoc()}
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
public Document document(int n) throws CorruptIndexException, IOException {
  // NOTE(review): the line numbering embedded in this copy of the file skips 836
  // here, so a short statement may have been dropped - confirm against upstream.
  if (n < 0 || n >= maxDoc()) {
    throw new IllegalArgumentException("docID must be >= 0 and < maxDoc=" + maxDoc() + " (got docID=" + n + ")");
  }
  return document(n, null);
}
/**
 * Get the {@link org.apache.lucene.document.Document} at the <code>n</code>
 * <sup>th</sup> position. The {@link FieldSelector} may be used to determine
 * what {@link org.apache.lucene.document.Field}s to load and how they should
 * be loaded. <b>NOTE:</b> If this Reader (more specifically, the underlying
 * <code>FieldsReader</code>) is closed before the lazy
 * {@link org.apache.lucene.document.Field} is loaded an exception may be
 * thrown. If you want the value of a lazy
 * {@link org.apache.lucene.document.Field} to be available after closing you
 * must explicitly load it or fetch the Document again with a new loader.
 * <p>
 * <b>NOTE:</b> for performance reasons, this method does not check if the
 * requested document is deleted, and therefore asking for a deleted document
 * may yield unspecified results. Usually this is not required, however you
 * can call {@link #isDeleted(int)} with the requested document ID to verify
 * the document is not deleted.
 *
 * @param n Get the document at the <code>n</code><sup>th</sup> position
 * @param fieldSelector The {@link FieldSelector} to use to determine what
 *        Fields should be loaded on the Document. May be null, in which case
 *        all Fields will be loaded.
 * @return The stored fields of the
 *         {@link org.apache.lucene.document.Document} at the nth position
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 * @see org.apache.lucene.document.Fieldable
 * @see org.apache.lucene.document.FieldSelector
 * @see org.apache.lucene.document.SetBasedFieldSelector
 * @see org.apache.lucene.document.LoadFirstFieldSelector
 */
// NOTE(review): the former "TODO (1.5): When we convert to JDK 1.5 make this
// Set<String>" looks stale - this file already uses generics and the signature
// below takes no Set. Confirm and drop.
public abstract Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;
/**
 * Returns true if document <i>n</i> has been deleted.
 *
 * @param n the document number to check
 * @return true if the document is marked as deleted
 */
public abstract boolean isDeleted(int n);
/**
 * Returns true if any documents have been deleted.
 *
 * @return true if at least one document is marked as deleted
 */
public abstract boolean hasDeletions();
/**
 * Returns true if there are norms stored for this field.
 *
 * @param field name of the field to check
 * @return true if {@link #norms(String)} returns a non-null array for the field
 * @throws IOException propagated from {@link #norms(String)}
 */
public boolean hasNorms(String field) throws IOException {
  // backward compatible implementation.
  // SegmentReader has an efficient implementation.
  // NOTE(review): the line numbering embedded in this copy of the file skips 886
  // here, so a short statement may have been dropped - confirm against upstream.
  return norms(field) != null;
}
/**
 * Returns the byte-encoded normalization factor for the named field of
 * every document. This is used by the search code to score documents.
 * Returns null if norms were not indexed for this field.
 *
 * @param field name of the field whose norms are requested
 * @return the encoded norms, or null if the field has none
 * @throws IOException if the norms cannot be read
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public abstract byte[] norms(String field) throws IOException;
898 /** Reads the byte-encoded normalization factor for the named field of every
899 * document. This is used by the search code to score documents.
901 * @see org.apache.lucene.document.Field#setBoost(float)
903 public abstract void norms(String field, byte[] bytes, int offset)
/**
 * Expert: Resets the normalization factor for the named field of the named
 * document. The norm represents the product of the field's {@link
 * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its {@link
 * Similarity#lengthNorm(String, int) length normalization}. Thus, to
 * preserve the length normalization values when resetting this, one should
 * base the new value upon the old.
 * <p>
 * <b>NOTE:</b> If this field does not index norms, then
 * this method throws {@link IllegalStateException}.
 *
 * @param doc the document number
 * @param field the field whose norm is reset
 * @param value the new, already byte-encoded norm; handed to
 *        {@link #doSetNorm(int, String, byte)}
 * @see #norms(String)
 * @see Similarity#decodeNormValue(byte)
 * @throws StaleReaderException if the index has changed
 *         since this reader was opened
 * @throws CorruptIndexException if the index is corrupt
 * @throws LockObtainFailedException if another writer
 *         has this index open (<code>write.lock</code> could not
 *         be obtained)
 * @throws IOException if there is a low-level IO error
 * @throws IllegalStateException if the field does not index norms
 */
public synchronized void setNorm(int doc, String field, byte value)
        throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  // NOTE(review): the line numbering embedded in this copy of the file skips
  // 928-930 here, so up to three short statements may have been dropped before
  // the call below - confirm against upstream.
  doSetNorm(doc, field, value);
}
/**
 * Implements setNorm in subclass. Called by
 * {@link #setNorm(int, String, byte)} with the same arguments.
 *
 * @param doc the document number
 * @param field the field whose norm is reset
 * @param value the byte-encoded norm
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doSetNorm(int doc, String field, byte value)
        throws CorruptIndexException, IOException;
/**
 * Expert: Resets the normalization factor for the named field of the named
 * document. The float value is encoded with the default Similarity's
 * {@link Similarity#encodeNormValue(float)} and passed to
 * {@link #setNorm(int, String, byte)}.
 *
 * @param doc the document number
 * @param field the field whose norm is reset
 * @param value the new norm, not yet encoded
 * @see #norms(String)
 * @see Similarity#decodeNormValue(byte)
 *
 * @throws StaleReaderException if the index has changed
 *         since this reader was opened
 * @throws CorruptIndexException if the index is corrupt
 * @throws LockObtainFailedException if another writer
 *         has this index open (<code>write.lock</code> could not
 *         be obtained)
 * @throws IOException if there is a low-level IO error
 * @deprecated Use {@link #setNorm(int, String, byte)} instead, encoding the
 *             float to byte with your Similarity's {@link Similarity#encodeNormValue(float)}.
 *             This method will be removed in Lucene 4.0
 */
// NOTE(review): the line numbering embedded in this copy of the file skips 955
// just before the signature; given the @deprecated tag this was presumably an
// @Deprecated annotation - confirm against upstream.
public void setNorm(int doc, String field, float value)
        throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  // NOTE(review): the embedded numbering also skips 958 here, so a short
  // statement may have been dropped - confirm against upstream.
  setNorm(doc, field, Similarity.getDefault().encodeNormValue(value));
}
/**
 * Returns an enumeration of all the terms in the index. The
 * enumeration is ordered by Term.compareTo(). Each term is greater
 * than all that precede it in the enumeration. Note that after
 * calling terms(), {@link TermEnum#next()} must be called
 * on the resulting enumeration before calling other methods such as
 * {@link TermEnum#term()}.
 *
 * @return an enumeration of every term, not yet positioned on the first one
 * @throws IOException if there is a low-level IO error
 */
public abstract TermEnum terms() throws IOException;
/**
 * Returns an enumeration of all terms starting at a given term. If
 * the given term does not exist, the enumeration is positioned at the
 * first term greater than the supplied term. The enumeration is
 * ordered by Term.compareTo(). Each term is greater than all that
 * precede it in the enumeration.
 *
 * @param t the term to start the enumeration at
 * @return an enumeration positioned at <code>t</code> or the first greater term
 * @throws IOException if there is a low-level IO error
 */
public abstract TermEnum terms(Term t) throws IOException;
/**
 * Returns the number of documents containing the term <code>t</code>.
 *
 * @param t the term to look up
 * @return the number of documents containing <code>t</code>
 * @throws IOException if there is a low-level IO error
 */
public abstract int docFreq(Term t) throws IOException;
986 /** Returns an enumeration of all the documents which contain
987 * <code>term</code>. For each document, the document number, the frequency of
988 * the term in that document is also provided, for use in
989 * search scoring. If term is null, then all non-deleted
990 * docs are returned with freq=1.
991 * Thus, this method implements the mapping:
993 * Term => <docNum, freq><sup>*</sup>
995 * <p>The enumeration is ordered by document number. Each document number
996 * is greater than all that precede it in the enumeration.
997 * @throws IOException if there is a low-level IO error
999 public TermDocs termDocs(Term term) throws IOException {
1001 TermDocs termDocs = termDocs();
1002 termDocs.seek(term);
/**
 * Returns an unpositioned {@link TermDocs} enumerator.
 * <p>
 * Note: the TermDocs returned is unpositioned. Before using it, ensure
 * that you first position it with {@link TermDocs#seek(Term)} or
 * {@link TermDocs#seek(TermEnum)}.
 *
 * @return an enumerator that must be positioned before use
 * @throws IOException if there is a low-level IO error
 */
public abstract TermDocs termDocs() throws IOException;
/**
 * Returns an enumeration of all the documents which contain
 * <code>term</code>. For each document, in addition to the document number
 * and frequency of the term in that document, a list of all of the ordinal
 * positions of the term in the document is available. Thus, this method
 * implements the mapping:
 *
 * <p><ul>
 * Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
 * &lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
 * pos<sub>freq-1</sub>&gt;
 * &gt;<sup>*</sup>
 * </ul>
 * <p> This positional information facilitates phrase and proximity searching.
 * <p>The enumeration is ordered by document number. Each document number is
 * greater than all that precede it in the enumeration.
 *
 * @param term the term whose documents and positions are enumerated
 * @return the enumerator obtained from {@link #termPositions()} after
 *         seek(term) has been called on it
 * @throws IOException if there is a low-level IO error
 */
public TermPositions termPositions(Term term) throws IOException {
  // NOTE(review): the line numbering embedded in this copy of the file skips 1034
  // here, so a short statement may have been dropped - confirm against upstream.
  TermPositions termPositions = termPositions();
  termPositions.seek(term);
  return termPositions;
}
/**
 * Returns an unpositioned {@link TermPositions} enumerator.
 *
 * @return an enumerator that must be positioned (see
 *         {@link #termPositions(Term)}) before use
 * @throws IOException if there is a low-level IO error
 */
public abstract TermPositions termPositions() throws IOException;
/**
 * Deletes the document numbered <code>docNum</code>. Once a document is
 * deleted it will not appear in TermDocs or TermPositions enumerations.
 * Attempts to read its fields with the {@link #document} method are not
 * checked for deletion and may yield unspecified results; use
 * {@link #isDeleted(int)} to test first. The presence of this document may
 * still be reflected in the {@link #docFreq} statistic, though
 * this will be corrected eventually as the index is further modified.
 *
 * @throws StaleReaderException if the index has changed
 *         since this reader was opened
 * @throws CorruptIndexException if the index is corrupt
 * @throws LockObtainFailedException if another writer
 *         has this index open (<code>write.lock</code> could not
 *         be obtained)
 * @throws IOException if there is a low-level IO error
 */
1062 public synchronized void deleteDocument(int docNum) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
/**
 * Implements deletion of the document numbered <code>docNum</code>.
 * Applications should call {@link #deleteDocument(int)} or {@link #deleteDocuments(Term)}.
 *
 * @param docNum the number of the document to delete
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doDelete(int docNum) throws CorruptIndexException, IOException;
1076 /** Deletes all documents that have a given <code>term</code> indexed.
1077 * This is useful if one uses a document field to hold a unique ID string for
1078 * the document. Then to delete such a document, one merely constructs a
1079 * term with the appropriate field and the unique ID string as its text and
1080 * passes it to this method.
1081 * See {@link #deleteDocument(int)} for information about when this deletion will
1084 * @return the number of documents deleted
1085 * @throws StaleReaderException if the index has changed
1086 * since this reader was opened
1087 * @throws CorruptIndexException if the index is corrupt
1088 * @throws LockObtainFailedException if another writer
1089 * has this index open (<code>write.lock</code> could not
1091 * @throws IOException if there is a low-level IO error
1093 public int deleteDocuments(Term term) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
1095 TermDocs docs = termDocs(term);
1096 if (docs == null) return 0;
1099 while (docs.next()) {
1100 deleteDocument(docs.doc());
/**
 * Undeletes all documents currently marked as deleted in this index.
 *
 * <p>NOTE: this method can only recover documents marked
 * for deletion but not yet removed from the index; when
 * and how Lucene removes deleted documents is an
 * implementation detail, subject to change from release
 * to release. However, you can use {@link
 * #numDeletedDocs} on the current IndexReader instance to
 * see how many documents will be un-deleted.
 *
 * @throws StaleReaderException if the index has changed
 *         since this reader was opened
 * @throws LockObtainFailedException if another writer
 *         has this index open (<code>write.lock</code> could not
 *         be obtained)
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
1128 public synchronized void undeleteAll() throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
/**
 * Implements actual undeleteAll() in subclass.
 *
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doUndeleteAll() throws CorruptIndexException, IOException;
/**
 * Does nothing by default. Subclasses that require a write lock for
 * index modifications must implement this method.
 *
 * @throws IOException never thrown by this default; declared so overriding
 *         implementations can report lock failures
 */
protected synchronized void acquireWriteLock() throws IOException {
  // Intentionally empty: the default implementation acquires nothing.
}
/**
 * @throws IOException if there is a low-level IO error
 */
1148 public final synchronized void flush() throws IOException {
/**
 * Passes the given user data on to {@link #commit(Map)}.
 *
 * @param commitUserData Opaque Map (String -&gt; String)
 *        that's recorded into the segments file in the index,
 *        and retrievable by {@link
 *        IndexReader#getCommitUserData}.
 * @throws IOException if there is a low-level IO error
 */
public final synchronized void flush(Map<String, String> commitUserData) throws IOException {
  // NOTE(review): the line numbering embedded in this copy of the file skips 1161
  // here, so a short statement may have been dropped - confirm against upstream.
  commit(commitUserData);
}
/**
 * Commit changes resulting from delete, undeleteAll, or
 * setNorm operations.
 * <p>
 * If an exception is hit, then either no changes or all
 * changes will have been committed to the index
 * (transactional semantics).
 *
 * @throws IOException if there is a low-level IO error
 */
1174 protected final synchronized void commit() throws IOException {
/**
 * Commit changes resulting from delete, undeleteAll, or
 * setNorm operations.
 * <p>
 * If an exception is hit, then either no changes or all
 * changes will have been committed to the index
 * (transactional semantics).
 *
 * @param commitUserData user data handed to {@link #doCommit(Map)}
 * @throws IOException if there is a low-level IO error
 */
public final synchronized void commit(Map<String, String> commitUserData) throws IOException {
  // NOTE(review): the line numbering embedded in this copy of the file skips 1188
  // before and 1190-1191 after the call below, so short lines (possibly a guard
  // and a follow-up statement) may have been dropped - confirm against upstream.
  doCommit(commitUserData);
}
/**
 * Implements commit. Called by {@link #commit(Map)} with the caller's user data.
 *
 * @param commitUserData the user data passed to {@link #commit(Map)}
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doCommit(Map<String, String> commitUserData) throws IOException;
/**
 * Closes files associated with this index.
 * Also saves any new deletions to disk.
 * No other methods should be called after this has been called.
 *
 * @throws IOException if there is a low-level IO error
 */
1203 public final synchronized void close() throws IOException {
/**
 * Implements close.
 *
 * @throws IOException if there is a low-level IO error
 */
protected abstract void doClose() throws IOException;
/**
 * Get a list of unique field names that exist in this index and have the
 * specified field option information.
 *
 * @param fldOption specifies which field option should be available for the returned fields
 * @return Collection of Strings indicating the names of the fields.
 * @see IndexReader.FieldOption
 */
public abstract Collection<String> getFieldNames(FieldOption fldOption);
1224 * Expert: return the IndexCommit that this reader has
1225 * opened. This method is only implemented by those
1226 * readers that correspond to a Directory with its own
1229 * @lucene.experimental
1231 public IndexCommit getIndexCommit() throws IOException {
1232 throw new UnsupportedOperationException("This reader does not support this method.");
1236 * Prints the filename and size of each file within a given compound file.
1237 * Add the -extract flag to extract files to the current working directory.
1238 * In order to make the extracted version of the index work, you have to copy
1239 * the segments file from the compound index into the directory where the extracted files are stored.
1240 * @param args Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>
1242 public static void main(String [] args) {
1243 String filename = null;
1244 boolean extract = false;
1246 for (int i = 0; i < args.length; ++i) {
1247 if (args[i].equals("-extract")) {
1249 } else if (filename == null) {
1254 if (filename == null) {
1255 System.out.println("Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>");
1259 Directory dir = null;
1260 CompoundFileReader cfr = null;
1263 File file = new File(filename);
1264 String dirname = file.getAbsoluteFile().getParent();
1265 filename = file.getName();
1266 dir = FSDirectory.open(new File(dirname));
1267 cfr = new CompoundFileReader(dir, filename);
1269 String [] files = cfr.listAll();
1270 ArrayUtil.mergeSort(files); // sort the array of filename so that the output is more readable
1272 for (int i = 0; i < files.length; ++i) {
1273 long len = cfr.fileLength(files[i]);
1276 System.out.println("extract " + files[i] + " with " + len + " bytes to local directory...");
1277 IndexInput ii = cfr.openInput(files[i]);
1279 FileOutputStream f = new FileOutputStream(files[i]);
1281 // read and write with a small buffer, which is more effective than reading byte by byte
1282 byte[] buffer = new byte[1024];
1283 int chunk = buffer.length;
1285 final int bufLen = (int) Math.min(chunk, len);
1286 ii.readBytes(buffer, 0, bufLen);
1287 f.write(buffer, 0, bufLen);
1295 System.out.println(files[i] + ": " + len + " bytes");
1297 } catch (IOException ioe) {
1298 ioe.printStackTrace();
1307 catch (IOException ioe) {
1308 ioe.printStackTrace();
1313 /** Returns all commit points that exist in the Directory.
1314 * Normally, because the default is {@link
1315 * KeepOnlyLastCommitDeletionPolicy}, there would be only
1316 * one commit point. But if you're using a custom {@link
1317 * IndexDeletionPolicy} then there could be many commits.
1318 * Once you have a given commit, you can open a reader on
1319 * it by calling {@link IndexReader#open(IndexCommit,boolean)}
1320 * There must be at least one commit in
1321 * the Directory, else this method throws {@link
1322 * IndexNotFoundException}. Note that if a commit is in
1323 * progress while this method is running, that commit
1324 * may or may not be returned.
1326 * @return a sorted list of {@link IndexCommit}s, from oldest
1328 public static Collection<IndexCommit> listCommits(Directory dir) throws IOException {
1329 return DirectoryReader.listCommits(dir);
1332 /** Expert: returns the sequential sub readers that this
1333 * reader is logically composed of. For example,
1334 * IndexSearcher uses this API to drive searching by one
1335 * sub reader at a time. If this reader is not composed
1336 * of sequential child readers, it should return null.
1337 * If this method returns an empty array, that means this
1338 * reader is a null reader (for example a MultiReader
1339 * that has no sub readers).
1341 * NOTE: You should not try using sub-readers returned by
1342 * this method to make any changes (setNorm, deleteDocument,
1343 * etc.). While this might succeed for one composite reader
1344 * (like MultiReader), it will most likely lead to index
1345 * corruption for other readers (like DirectoryReader obtained
1346 * through {@link #open}. Use the parent reader directly. */
1347 public IndexReader[] getSequentialSubReaders() {
1352 public Object getCoreCacheKey() {
1356 /** Expert. Warning: this returns null if the reader has
1358 public Object getDeletesCacheKey() {
1362 /** Returns the number of unique terms (across all fields)
1365 * This method returns long, even though internally
1366 * Lucene cannot handle more than 2^31 unique terms, for
1367 * a possible future when this limitation is removed.
1369 * @throws UnsupportedOperationException if this count
1370 * cannot be easily determined (eg Multi*Readers).
1371 * Instead, you should call {@link
1372 * #getSequentialSubReaders} and ask each sub reader for
1373 * its unique term count. */
1374 public long getUniqueTermCount() throws IOException {
1375 throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
1378 /** For IndexReader implementations that use
1379 * TermInfosReader to read terms, this returns the
1380 * current indexDivisor as specified when the reader was
1383 public int getTermInfosIndexDivisor() {
1384 throw new UnsupportedOperationException("This reader does not support this method.");