+++ /dev/null
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.document.AbstractField; // for javadocs
-import org.apache.lucene.document.Document;
-
-import java.text.NumberFormat;
-import java.io.PrintStream;
-import java.io.IOException;
-import java.io.File;
-import java.util.Collection;
-
-import java.util.Comparator;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Map;
-
-/**
- * Basic tool and API to check the health of an index and
- * write a new segments file that removes reference to
- * problematic segments.
- *
- * <p>As this tool checks every byte in the index, on a large
- * index it can take quite a long time to run.
- *
- * @lucene.experimental Please make a complete backup of your
- * index before using this to fix your index!
- */
-public class CheckIndex {
-
- /** Destination for progress and diagnostic output; when null, {@link #msg} prints nothing. */
- private PrintStream infoStream;
- /** Directory holding the index to check; assigned by the constructor. */
- private Directory dir;
-
- /**
- * Returned from {@link #checkIndex()} detailing the health and status of the index.
- * All fields are plain data filled in by {@link CheckIndex#checkIndex(List)}.
- *
- * @lucene.experimental
- **/
-
- public static class Status {
-
- /** True if no problems were found with the index. */
- public boolean clean;
-
- /** True if we were unable to locate and load the segments_N file. */
- public boolean missingSegments;
-
- /** True if we were unable to open the segments_N file. */
- public boolean cantOpenSegments;
-
- /** True if we were unable to read the version number from segments_N file. */
- public boolean missingSegmentVersion;
-
- /** Name of latest segments_N file in the index. */
- public String segmentsFileName;
-
- /** Number of segments in the index. */
- public int numSegments;
-
- /** String description of the version of the index. */
- public String segmentFormat;
-
- /** Names of the segments that were explicitly requested for checking;
- * empty when no specific segment list was passed.
- * @see CheckIndex#checkIndex(List) */
- public List<String> segmentsChecked = new ArrayList<String>();
-
- /** True if the index was created with a newer version of Lucene than the CheckIndex tool. */
- public boolean toolOutOfDate;
-
- /** List of {@link SegmentInfoStatus} instances, detailing status of each segment. */
- public List<SegmentInfoStatus> segmentInfos = new ArrayList<SegmentInfoStatus>();
-
- /** Directory index is in. */
- public Directory dir;
-
- /**
- * SegmentInfos instance containing only segments that
- * had no problems (this is used with the {@link CheckIndex#fixIndex}
- * method to repair the index). Remains null when the check
- * returned early, before any segment was examined.
- */
- SegmentInfos newSegments;
-
- /** How many documents will be lost to bad segments. */
- public int totLoseDocCount;
-
- /** How many bad segments were found. */
- public int numBadSegments;
-
- /** True if we checked only specific segments (that is,
- * {@link #checkIndex(List)} was called with a non-null
- * argument). */
- public boolean partial;
-
- /** The greatest segment name, parsed as a number from the
- * part of the name after its first character. */
- public int maxSegmentName;
-
- /** Whether the SegmentInfos.counter is greater than any of the segments' names. */
- public boolean validCounter;
-
- /** Holds the userData of the last commit in the index */
- public Map<String, String> userData;
-
- /** Holds the status of each segment in the index.
- * See {@link #segmentInfos}.
- *
- * <p><b>WARNING</b>: this API is new and experimental and is
- * subject to suddenly change in the next release.
- */
- public static class SegmentInfoStatus {
- /** Name of the segment. */
- public String name;
-
- /** Document count (does not take deletions into account). */
- public int docCount;
-
- /** True if segment is compound file format. */
- public boolean compound;
-
- /** Number of files referenced by this segment. */
- public int numFiles;
-
- /** Net size (MB) of the files referenced by this
- * segment. */
- public double sizeMB;
-
- /** Doc store offset, if this segment shares the doc
- * store files (stored fields and term vectors) with
- * other segments. This is -1 if it does not share. */
- public int docStoreOffset = -1;
-
- /** String of the shared doc store segment, or null if
- * this segment does not share the doc store files. */
- public String docStoreSegment;
-
- /** True if the shared doc store files are compound file
- * format. */
- public boolean docStoreCompoundFile;
-
- /** True if this segment has pending deletions. */
- public boolean hasDeletions;
-
- /** Name of the current deletions file name. */
- public String deletionsFileName;
-
- /** Number of deleted documents. */
- public int numDeleted;
-
- /** True if we were able to open a SegmentReader on this
- * segment. */
- public boolean openReaderPassed;
-
- /** Number of fields in this segment. Note: package-private, unlike the other fields. */
- int numFields;
-
- /** True if at least one of the fields in this segment
- * has position data
- * @see AbstractField#setIndexOptions(org.apache.lucene.index.FieldInfo.IndexOptions) */
- public boolean hasProx;
-
- /** Map that includes certain
- * debugging details that IndexWriter records into
- * each segment it creates */
- public Map<String,String> diagnostics;
-
- /** Status for testing of field norms (null if field norms could not be tested). */
- public FieldNormStatus fieldNormStatus;
-
- /** Status for testing of indexed terms (null if indexed terms could not be tested). */
- public TermIndexStatus termIndexStatus;
-
- /** Status for testing of stored fields (null if stored fields could not be tested). */
- public StoredFieldStatus storedFieldStatus;
-
- /** Status for testing of term vectors (null if term vectors could not be tested). */
- public TermVectorStatus termVectorStatus;
- }
-
- /**
- * Status from testing field norms.
- */
- public static final class FieldNormStatus {
- /** Number of fields successfully tested */
- public long totFields = 0L;
-
- /** Exception thrown during field norms test (null on success) */
- public Throwable error = null;
- }
-
- /**
- * Status from testing term index.
- */
- public static final class TermIndexStatus {
- /** Total term count */
- public long termCount = 0L;
-
- /** Total frequency across all terms. */
- public long totFreq = 0L;
-
- /** Total number of positions. */
- public long totPos = 0L;
-
- /** Exception thrown during term index test (null on success) */
- public Throwable error = null;
- }
-
- /**
- * Status from testing stored fields.
- */
- public static final class StoredFieldStatus {
-
- /** Number of documents tested. */
- public int docCount = 0;
-
- /** Total number of stored fields tested. */
- public long totFields = 0;
-
- /** Exception thrown during stored fields test (null on success) */
- public Throwable error = null;
- }
-
- /**
- * Status from testing term vectors.
- */
- public static final class TermVectorStatus {
-
- /** Number of documents tested. */
- public int docCount = 0;
-
- /** Total number of term vectors tested. */
- public long totVectors = 0;
-
- /** Exception thrown during term vector test (null on success) */
- public Throwable error = null;
- }
- }
-
-  /**
-   * Creates a checker bound to the given directory. No info stream is
-   * installed, so nothing is printed until {@link #setInfoStream} is called.
-   *
-   * @param dir directory containing the index to check
-   */
-  public CheckIndex(Directory dir) {
-    this.infoStream = null;
-    this.dir = dir;
-  }
-
-  /**
-   * Installs the stream that receives progress and diagnostic messages.
-   *
-   * @param out target stream, or null to suppress all messages
-   */
-  public void setInfoStream(PrintStream out) {
-    this.infoStream = out;
-  }
-
-  /** Prints one line to the info stream; does nothing when no stream is set. */
-  private void msg(String msg) {
-    if (infoStream == null) {
-      return;
-    }
-    infoStream.println(msg);
-  }
-
-  /**
-   * SegmentTermDocs subclass that counts how often {@link #skippingDoc()}
-   * fires between seeks. The term index test uses that count as the number
-   * of deleted documents seen for a term.
-   */
-  private static class MySegmentTermDocs extends SegmentTermDocs {
-
-    /** Number of skippingDoc() callbacks since the most recent seek. */
-    int delCount;
-
-    MySegmentTermDocs(SegmentReader segmentReader) {
-      super(segmentReader);
-    }
-
-    /** Positions on the term, then restarts the skipped-document counter. */
-    @Override
-    public void seek(Term term) throws IOException {
-      super.seek(term);
-      this.delCount = 0;
-    }
-
-    /** Callback from the superclass; tallies one skipped document. */
-    @Override
-    protected void skippingDoc() throws IOException {
-      this.delCount += 1;
-    }
-  }
-
-  /**
-   * Checks every segment of the index and reports the outcome.
-   * Equivalent to calling {@link #checkIndex(List)} with a null segment list.
-   *
-   * <p>As this method checks every byte in the index, on a large
-   * index it can take quite a long time to run.
-   *
-   * <p><b>WARNING</b>: make sure
-   * you only call this when the index is not opened by any
-   * writer.
-   *
-   * @return a {@link Status} instance detailing the state of the index
-   */
-  public Status checkIndex() throws IOException {
-    // A null list means "no restriction": every segment is checked.
-    final List<String> everySegment = null;
-    return checkIndex(everySegment);
-  }
-
-  /**
-   * Returns a {@link Status} instance detailing the state of the index.
-   *
-   * <p>The method reads the segments file, reports its format and the range
-   * of segment versions, then opens each (selected) segment and runs the
-   * field-norm, term-index, stored-field and term-vector tests on it.
-   * Segments that pass are collected into the status' replacement
-   * SegmentInfos, which {@link #fixIndex} can later commit.
-   *
-   * <p>As this method checks every byte in the specified
-   * segments, on a large index it can take quite a long
-   * time to run.
-   *
-   * <p><b>WARNING</b>: make sure
-   * you only call this when the index is not opened by any
-   * writer.
-   *
-   * @param onlySegments list of specific segment names to check, or null
-   *        to check every segment; a non-null list marks the result as partial
-   * @return the collected status; on an early failure (segments file missing,
-   *         unopenable or unreadable, or index newer than this tool) only the
-   *         corresponding flag is set and no per-segment data is present
-   * @throws IOException declared by the signature; closing the segments file
-   *         or a segment reader happens outside the catch-all handlers
-   */
-  public Status checkIndex(List<String> onlySegments) throws IOException {
-    NumberFormat nf = NumberFormat.getInstance();
-    SegmentInfos sis = new SegmentInfos();
-    Status result = new Status();
-    result.dir = dir;
-    try {
-      sis.read(dir);
-    } catch (Throwable t) {
-      msg("ERROR: could not read any segments file in directory");
-      result.missingSegments = true;
-      if (infoStream != null) {
-        t.printStackTrace(infoStream);
-      }
-      return result;
-    }
-
-    // find the oldest and newest segment versions
-    String oldest = Integer.toString(Integer.MAX_VALUE), newest = Integer.toString(Integer.MIN_VALUE);
-    String oldSegs = null;
-    boolean foundNonNullVersion = false;
-    Comparator<String> versionComparator = StringHelper.getVersionComparator();
-    for (SegmentInfo si : sis) {
-      String version = si.getVersion();
-      if (version == null) {
-        // pre-3.1 segment
-        oldSegs = "pre-3.1";
-      } else if (version.equals("2.x")) {
-        // an old segment that was 'touched' by 3.1+ code
-        oldSegs = "2.x";
-      } else {
-        foundNonNullVersion = true;
-        if (versionComparator.compare(version, oldest) < 0) {
-          oldest = version;
-        }
-        if (versionComparator.compare(version, newest) > 0) {
-          newest = version;
-        }
-      }
-    }
-
-    final int numSegments = sis.size();
-    final String segmentsFileName = sis.getCurrentSegmentFileName();
-    IndexInput input = null;
-    try {
-      input = dir.openInput(segmentsFileName);
-    } catch (Throwable t) {
-      msg("ERROR: could not open segments file in directory");
-      if (infoStream != null) {
-        t.printStackTrace(infoStream);
-      }
-      result.cantOpenSegments = true;
-      return result;
-    }
-    int format = 0;
-    try {
-      format = input.readInt();
-    } catch (Throwable t) {
-      msg("ERROR: could not read segment file version in directory");
-      if (infoStream != null) {
-        t.printStackTrace(infoStream);
-      }
-      result.missingSegmentVersion = true;
-      return result;
-    } finally {
-      if (input != null) {
-        input.close();
-      }
-    }
-
-    String sFormat = "";
-    boolean skip = false;
-
-    // A single else-if chain, so exactly one label is chosen. Previously the
-    // FORMAT test was a stand-alone 'if', and its label was then overwritten
-    // by the trailing "Lucene 1.3 or prior" fallback of the following chain.
-    if (format == SegmentInfos.FORMAT) {
-      sFormat = "FORMAT [Lucene Pre-2.1]";
-    } else if (format == SegmentInfos.FORMAT_LOCKLESS) {
-      sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
-    } else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE) {
-      sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
-    } else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE) {
-      sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
-    } else if (format == SegmentInfos.FORMAT_CHECKSUM) {
-      sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
-    } else if (format == SegmentInfos.FORMAT_DEL_COUNT) {
-      sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
-    } else if (format == SegmentInfos.FORMAT_HAS_PROX) {
-      sFormat = "FORMAT_HAS_PROX [Lucene 2.4]";
-    } else if (format == SegmentInfos.FORMAT_USER_DATA) {
-      sFormat = "FORMAT_USER_DATA [Lucene 2.9]";
-    } else if (format == SegmentInfos.FORMAT_DIAGNOSTICS) {
-      sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]";
-    } else if (format == SegmentInfos.FORMAT_HAS_VECTORS) {
-      sFormat = "FORMAT_HAS_VECTORS [Lucene 3.1]";
-    } else if (format == SegmentInfos.FORMAT_3_1) {
-      sFormat = "FORMAT_3_1 [Lucene 3.1+]";
-    } else if (format == SegmentInfos.CURRENT_FORMAT) {
-      // Reached only if a new current format was added without a label above.
-      throw new RuntimeException("BUG: You should update this tool!");
-    } else if (format < SegmentInfos.CURRENT_FORMAT) {
-      sFormat = "int=" + format + " [newer version of Lucene than this tool]";
-      skip = true;
-    } else {
-      sFormat = format + " [Lucene 1.3 or prior]";
-    }
-
-    result.segmentsFileName = segmentsFileName;
-    result.numSegments = numSegments;
-    result.segmentFormat = sFormat;
-    result.userData = sis.getUserData();
-    String userDataString;
-    if (sis.getUserData().size() > 0) {
-      userDataString = " userData=" + sis.getUserData();
-    } else {
-      userDataString = "";
-    }
-
-    final String versionString;
-    if (oldSegs != null) {
-      if (foundNonNullVersion) {
-        versionString = "versions=[" + oldSegs + " .. " + newest + "]";
-      } else {
-        versionString = "version=" + oldSegs;
-      }
-    } else if (foundNonNullVersion) {
-      versionString = oldest.equals(newest) ? ( "version=" + oldest ) : ("versions=[" + oldest + " .. " + newest + "]");
-    } else {
-      // No segment contributed a version (the index has no segments), so
-      // oldest/newest still hold their sentinel values; report nothing.
-      versionString = "";
-    }
-
-    msg("Segments file=" + segmentsFileName + " numSegments=" + numSegments
-        + " " + versionString + " format=" + sFormat + userDataString);
-
-    if (onlySegments != null) {
-      result.partial = true;
-      if (infoStream != null) {
-        infoStream.print("\nChecking only these segments:");
-      }
-      for (String s : onlySegments) {
-        if (infoStream != null) {
-          infoStream.print(" " + s);
-        }
-      }
-      result.segmentsChecked.addAll(onlySegments);
-      msg(":");
-    }
-
-    if (skip) {
-      msg("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
-      result.toolOutOfDate = true;
-      return result;
-    }
-
-    // Start from a copy of the commit with no segments; healthy ones are added below.
-    result.newSegments = (SegmentInfos) sis.clone();
-    result.newSegments.clear();
-    result.maxSegmentName = -1;
-
-    for (int i = 0; i < numSegments; i++) {
-      final SegmentInfo info = sis.info(i);
-      // Segment names are a one-character prefix followed by a base-36 number.
-      int segmentName = Integer.parseInt(info.name.substring(1), Character.MAX_RADIX);
-      if (segmentName > result.maxSegmentName) {
-        result.maxSegmentName = segmentName;
-      }
-      if (onlySegments != null && !onlySegments.contains(info.name)) {
-        continue;
-      }
-      Status.SegmentInfoStatus segInfoStat = new Status.SegmentInfoStatus();
-      result.segmentInfos.add(segInfoStat);
-      msg(" " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
-      segInfoStat.name = info.name;
-      segInfoStat.docCount = info.docCount;
-
-      // Until a reader opens, assume every document of the segment would be lost.
-      int toLoseDocCount = info.docCount;
-
-      SegmentReader reader = null;
-
-      try {
-        msg(" compound=" + info.getUseCompoundFile());
-        segInfoStat.compound = info.getUseCompoundFile();
-        msg(" hasProx=" + info.getHasProx());
-        segInfoStat.hasProx = info.getHasProx();
-        msg(" numFiles=" + info.files().size());
-        segInfoStat.numFiles = info.files().size();
-        segInfoStat.sizeMB = info.sizeInBytes(true)/(1024.*1024.);
-        msg(" size (MB)=" + nf.format(segInfoStat.sizeMB));
-        Map<String,String> diagnostics = info.getDiagnostics();
-        segInfoStat.diagnostics = diagnostics;
-        if (diagnostics.size() > 0) {
-          msg(" diagnostics = " + diagnostics);
-        }
-
-        final int docStoreOffset = info.getDocStoreOffset();
-        if (docStoreOffset != -1) {
-          msg(" docStoreOffset=" + docStoreOffset);
-          segInfoStat.docStoreOffset = docStoreOffset;
-          msg(" docStoreSegment=" + info.getDocStoreSegment());
-          segInfoStat.docStoreSegment = info.getDocStoreSegment();
-          msg(" docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());
-          segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile();
-        }
-        final String delFileName = info.getDelFileName();
-        if (delFileName == null) {
-          msg(" no deletions");
-          segInfoStat.hasDeletions = false;
-        } else {
-          msg(" has deletions [delFileName=" + delFileName + "]");
-          segInfoStat.hasDeletions = true;
-          segInfoStat.deletionsFileName = delFileName;
-        }
-        if (infoStream != null) {
-          infoStream.print(" test: open reader.........");
-        }
-        reader = SegmentReader.get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR);
-
-        segInfoStat.openReaderPassed = true;
-
-        final int numDocs = reader.numDocs();
-        // Now that the reader is open, only the live documents count as lost.
-        toLoseDocCount = numDocs;
-        if (reader.hasDeletions()) {
-          if (reader.deletedDocs.count() != info.getDelCount()) {
-            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
-          }
-          if (reader.deletedDocs.count() > reader.maxDoc()) {
-            throw new RuntimeException("too many deleted docs: maxDoc()=" + reader.maxDoc() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
-          }
-          if (info.docCount - numDocs != info.getDelCount()) {
-            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
-          }
-          segInfoStat.numDeleted = info.docCount - numDocs;
-          msg("OK [" + (segInfoStat.numDeleted) + " deleted docs]");
-        } else {
-          if (info.getDelCount() != 0) {
-            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
-          }
-          msg("OK");
-        }
-        if (reader.maxDoc() != info.docCount) {
-          throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.docCount);
-        }
-
-        // Test getFieldNames()
-        if (infoStream != null) {
-          infoStream.print(" test: fields..............");
-        }
-        Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
-        msg("OK [" + fieldNames.size() + " fields]");
-        segInfoStat.numFields = fieldNames.size();
-
-        // Test Field Norms
-        segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader);
-
-        // Test the Term Index
-        segInfoStat.termIndexStatus = testTermIndex(info, reader);
-
-        // Test Stored Fields
-        segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
-
-        // Test Term Vectors
-        segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);
-
-        // Rethrow the first exception we encountered
-        // This will cause stats for failed segments to be incremented properly
-        if (segInfoStat.fieldNormStatus.error != null) {
-          throw new RuntimeException("Field Norm test failed");
-        } else if (segInfoStat.termIndexStatus.error != null) {
-          throw new RuntimeException("Term Index test failed");
-        } else if (segInfoStat.storedFieldStatus.error != null) {
-          throw new RuntimeException("Stored Field test failed");
-        } else if (segInfoStat.termVectorStatus.error != null) {
-          throw new RuntimeException("Term Vector test failed");
-        }
-
-        msg("");
-
-      } catch (Throwable t) {
-        msg("FAILED");
-        String comment;
-        comment = "fixIndex() would remove reference to this segment";
-        msg(" WARNING: " + comment + "; full exception:");
-        if (infoStream != null) {
-          t.printStackTrace(infoStream);
-        }
-        msg("");
-        result.totLoseDocCount += toLoseDocCount;
-        result.numBadSegments++;
-        continue;
-      } finally {
-        if (reader != null) {
-          reader.close();
-        }
-      }
-
-      // Keeper
-      result.newSegments.add((SegmentInfo) info.clone());
-    }
-
-    if (0 == result.numBadSegments) {
-      result.clean = true;
-    } else {
-      msg("WARNING: " + result.numBadSegments + " broken segments (containing " + result.totLoseDocCount + " documents) detected");
-    }
-
-    // The next-segment counter must be beyond every existing segment name.
-    if ( ! (result.validCounter = (result.maxSegmentName < sis.counter))) {
-      result.clean = false;
-      result.newSegments.counter = result.maxSegmentName + 1;
-      msg("ERROR: Next segment name counter " + sis.counter + " is not greater than max segment name " + result.maxSegmentName);
-    }
-
-    if (result.clean) {
-      msg("No problems were detected with this index.\n");
-    }
-
-    return result;
-  }
-
- /**
- * Test field norms.
- */
- private Status.FieldNormStatus testFieldNorms(Collection<String> fieldNames, SegmentReader reader) {
- final Status.FieldNormStatus status = new Status.FieldNormStatus();
-
- try {
- // Test Field Norms
- if (infoStream != null) {
- infoStream.print(" test: field norms.........");
- }
- final byte[] b = new byte[reader.maxDoc()];
- for (final String fieldName : fieldNames) {
- if (reader.hasNorms(fieldName)) {
- reader.norms(fieldName, b, 0);
- ++status.totFields;
- }
- }
-
- msg("OK [" + status.totFields + " fields]");
- } catch (Throwable e) {
- msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
- status.error = e;
- if (infoStream != null) {
- e.printStackTrace(infoStream);
- }
- }
-
- return status;
- }
-
- /**
- * Test the term index.
- *
- * <p>Walks every term of the segment and, for each one, iterates its
- * postings checking that doc IDs strictly increase and stay below maxDoc,
- * that each freq is positive and that positions never decrease. It then
- * exercises skipTo at seven evenly spaced targets, and finally checks that
- * the number of docs seen plus the number of deleted docs equals the term's
- * docFreq. A search on the last term is run at the end.
- *
- * <p>Any failure is recorded in the returned status instead of being thrown.
- *
- * @param info segment being tested (not referenced by this method's body)
- * @param reader reader opened on the segment
- * @return status with the term, doc-frequency and position totals, plus the error if one occurred
- */
- private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) {
- final Status.TermIndexStatus status = new Status.TermIndexStatus();
-
- // NOTE(review): neither this searcher nor the enumerators created below are
- // closed in this method; confirm that closing the reader afterwards is enough.
- final IndexSearcher is = new IndexSearcher(reader);
-
- try {
- if (infoStream != null) {
- infoStream.print(" test: terms, freq, prox...");
- }
-
- final TermEnum termEnum = reader.terms();
- final TermPositions termPositions = reader.termPositions();
-
- // Used only to count up # deleted docs for this term
- final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);
-
- final int maxDoc = reader.maxDoc();
- Term lastTerm = null;
- while (termEnum.next()) {
- status.termCount++;
- final Term term = termEnum.term();
- lastTerm = term;
-
- final int docFreq = termEnum.docFreq();
- termPositions.seek(term);
- int lastDoc = -1;
- // Number of postings actually returned for this term
- int freq0 = 0;
- status.totFreq += docFreq;
- while (termPositions.next()) {
- freq0++;
- final int doc = termPositions.doc();
- final int freq = termPositions.freq();
- if (doc <= lastDoc)
- throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
- if (doc >= maxDoc)
- throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
-
- lastDoc = doc;
- if (freq <= 0)
- throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
-
- int lastPos = -1;
- status.totPos += freq;
- for(int j=0;j<freq;j++) {
- final int pos = termPositions.nextPosition();
- // NOTE(review): a position of -1 is accepted here, while the skip test
- // below rejects every negative position; confirm the asymmetry is intended.
- if (pos < -1)
- throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
- if (pos < lastPos)
- throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
- lastPos = pos;
- }
- }
-
- // Test skipping
- for(int idx=0;idx<7;idx++) {
- // Targets at 1/8, 2/8, ... 7/8 of maxDoc; computed in long to avoid int overflow
- final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8);
- termPositions.seek(term);
- if (!termPositions.skipTo(skipDocID)) {
- // No posting at or beyond this target, so later (larger) targets are skipped too
- break;
- } else {
-
- final int docID = termPositions.doc();
- if (docID < skipDocID) {
- throw new RuntimeException("term " + term + ": skipTo(docID=" + skipDocID + ") returned docID=" + docID);
- }
- final int freq = termPositions.freq();
- if (freq <= 0) {
- throw new RuntimeException("termFreq " + freq + " is out of bounds");
- }
- int lastPosition = -1;
- for(int posUpto=0;posUpto<freq;posUpto++) {
- final int pos = termPositions.nextPosition();
- if (pos < 0) {
- throw new RuntimeException("position " + pos + " is out of bounds");
- }
- // TODO: we should assert when all pos == 0 that positions are actually omitted
- if (pos < lastPosition) {
- throw new RuntimeException("position " + pos + " is < lastPosition " + lastPosition);
- }
- lastPosition = pos;
- }
-
- // The posting after the skip target must have a larger doc ID
- if (!termPositions.next()) {
- break;
- }
- final int nextDocID = termPositions.doc();
- if (nextDocID <= docID) {
- throw new RuntimeException("term " + term + ": skipTo(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
- }
- }
- }
-
- // Now count how many deleted docs occurred in
- // this term:
- final int delCount;
- if (reader.hasDeletions()) {
- myTermDocs.seek(term);
- // Drain the enumerator; the loop exists only to drive the skippingDoc() counter
- while(myTermDocs.next()) { }
- delCount = myTermDocs.delCount;
- } else {
- delCount = 0;
- }
-
- if (freq0 + delCount != docFreq) {
- throw new RuntimeException("term " + term + " docFreq=" +
- docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
- }
- }
-
- // Test search on last term:
- if (lastTerm != null) {
- is.search(new TermQuery(lastTerm), 1);
- }
-
- msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
-
- } catch (Throwable e) {
- // Record the failure on the status; the caller decides how to report the segment
- msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
- status.error = e;
- if (infoStream != null) {
- e.printStackTrace(infoStream);
- }
- }
-
- return status;
- }
-
-  /**
-   * Test stored fields for a segment.
-   *
-   * <p>Loads the stored document of every non-deleted doc ID below
-   * {@code info.docCount}, totals the stored fields, and verifies that the
-   * number of documents visited equals {@code reader.numDocs()}. Any failure
-   * is recorded in the returned status instead of being thrown.
-   *
-   * @param info segment being tested; supplies the doc ID upper bound
-   * @param reader reader opened on the segment
-   * @param format number format used for the average in the progress message
-   * @return status with document and field totals, plus the error if one occurred
-   */
-  private Status.StoredFieldStatus testStoredFields(SegmentInfo info, SegmentReader reader, NumberFormat format) {
-    final Status.StoredFieldStatus status = new Status.StoredFieldStatus();
-
-    try {
-      if (infoStream != null) {
-        infoStream.print(" test: stored fields.......");
-      }
-
-      // Scan stored fields for all documents
-      for (int j = 0; j < info.docCount; ++j) {
-        if (!reader.isDeleted(j)) {
-          status.docCount++;
-          Document doc = reader.document(j);
-          status.totFields += doc.getFields().size();
-        }
-      }
-
-      // Validate docCount
-      if (status.docCount != reader.numDocs()) {
-        // Report both sides of the comparison; the message used to print
-        // status.docCount twice, which hid the expected value.
-        throw new RuntimeException("docCount=" + reader.numDocs() + " but saw " + status.docCount + " undeleted docs");
-      }
-
-      msg("OK [" + status.totFields + " total field count; avg " +
-          format.format((((float) status.totFields)/status.docCount)) + " fields per doc]");
-    } catch (Throwable e) {
-      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
-      status.error = e;
-      if (infoStream != null) {
-        e.printStackTrace(infoStream);
-      }
-    }
-
-    return status;
-  }
-
- /**
- * Test term vectors for a segment.
- */
- private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
- final Status.TermVectorStatus status = new Status.TermVectorStatus();
-
- try {
- if (infoStream != null) {
- infoStream.print(" test: term vectors........");
- }
-
- for (int j = 0; j < info.docCount; ++j) {
- if (!reader.isDeleted(j)) {
- status.docCount++;
- TermFreqVector[] tfv = reader.getTermFreqVectors(j);
- if (tfv != null) {
- status.totVectors += tfv.length;
- }
- }
- }
-
- msg("OK [" + status.totVectors + " total vector count; avg " +
- format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
- } catch (Throwable e) {
- msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
- status.error = e;
- if (infoStream != null) {
- e.printStackTrace(infoStream);
- }
- }
-
- return status;
- }
-
-  /** Repairs the index using previously returned result
-   *  from {@link #checkIndex}.  Note that this does not
-   *  remove any of the unreferenced files after it's done;
-   *  you must separately open an {@link IndexWriter}, which
-   *  deletes unreferenced files when it's created.
-   *
-   *  <p><b>WARNING</b>: this writes a
-   *  new segments file into the index, effectively removing
-   *  all documents in broken segments from the index.
-   *  BE CAREFUL.
-   *
-   *  <p><b>WARNING</b>: Make sure you only call this when the
-   *  index is not opened by any writer.
-   *
-   *  @param result status returned by a full (non-partial) check that ran to completion
-   *  @throws IllegalArgumentException if the status covers only a subset of
-   *          segments, or if the check stopped before examining any segment
-   *          (segments file missing, unopenable or unreadable, or index newer
-   *          than this tool), so no replacement segment list exists
-   *  @throws IOException if committing the new segments file fails */
-  public void fixIndex(Status result) throws IOException {
-    if (result.partial) {
-      throw new IllegalArgumentException("can only fix an index that was fully checked (this status checked a subset of segments)");
-    }
-    // checkIndex leaves newSegments null on every early return; fail with a
-    // clear message rather than a bare NullPointerException.
-    if (result.newSegments == null) {
-      throw new IllegalArgumentException("can only fix an index whose check ran to completion (this status has no list of surviving segments)");
-    }
-    result.newSegments.changed();
-    result.newSegments.commit(result.dir);
-  }
-
- /** Set to true by {@link #testAsserts()}, which only runs when assertions are enabled. */
- private static boolean assertsOn;
-
- /**
- * Records that assertions are enabled. Always returns true so that it can be
- * used as the condition of an assert statement without ever failing it.
- */
- private static boolean testAsserts() {
- assertsOn = true;
- return true;
- }
-
- /**
- * Reports whether assertions are enabled for this class.
- *
- * @return true if the assert statement below was executed
- */
- private static boolean assertsOn() {
- // With assertions disabled the assert is skipped, testAsserts() never runs,
- // and the flag keeps its default value of false.
- assert testAsserts();
- return assertsOn;
- }
-
- /** Command-line interface to check and fix an index.
-
- <p>
- Run it like this:
- <pre>
- java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
- </pre>
- <ul>
- <li><code>-fix</code>: actually write a new segments_N file, removing any problematic segments
-
- <li><code>-segment X</code>: only check the specified
- segment(s). This can be specified multiple times,
- to check more than one segment, eg <code>-segment _2
- -segment _a</code>. You can't use this with the -fix
- option.
- </ul>
-
- <p><b>WARNING</b>: <code>-fix</code> should only be used on an emergency basis as it will cause
- documents (perhaps many) to be permanently removed from the index. Always make
- a backup copy of your index before running this! Do not run this tool on an index
- that is actively being written to. You have been warned!
-
- <p> Run without -fix, this tool will open the index, report version information
- and report any exceptions it hits and what action it would take if -fix were
- specified. With -fix, this tool will remove any segments that have issues and
- write a new segments_N file. This means all documents contained in the affected
- segments will be removed.
-
- <p>
- This tool exits with exit code 1 if the index cannot be opened or has any
- corruption, else 0.
- */
- public static void main(String[] args) throws IOException, InterruptedException {
-
- boolean doFix = false;
- List<String> onlySegments = new ArrayList<String>();
- String indexPath = null;
- int i = 0;
- while(i < args.length) {
- if (args[i].equals("-fix")) {
- doFix = true;
- i++;
- } else if (args[i].equals("-segment")) {
- if (i == args.length-1) {
- System.out.println("ERROR: missing name for -segment option");
- System.exit(1);
- }
- onlySegments.add(args[i+1]);
- i += 2;
- } else {
- if (indexPath != null) {
- System.out.println("ERROR: unexpected extra argument '" + args[i] + "'");
- System.exit(1);
- }
- indexPath = args[i];
- i++;
- }
- }
-
- if (indexPath == null) {
- System.out.println("\nERROR: index path not specified");
- System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" +
- "\n" +
- " -fix: actually write a new segments_N file, removing any problematic segments\n" +
- " -segment X: only check the specified segments. This can be specified multiple\n" +
- " times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
- " You can't use this with the -fix option\n" +
- "\n" +
- "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
- "documents (perhaps many) to be permanently removed from the index. Always make\n" +
- "a backup copy of your index before running this! Do not run this tool on an index\n" +
- "that is actively being written to. You have been warned!\n" +
- "\n" +
- "Run without -fix, this tool will open the index, report version information\n" +
- "and report any exceptions it hits and what action it would take if -fix were\n" +
- "specified. With -fix, this tool will remove any segments that have issues and\n" +
- "write a new segments_N file. This means all documents contained in the affected\n" +
- "segments will be removed.\n" +
- "\n" +
- "This tool exits with exit code 1 if the index cannot be opened or has any\n" +
- "corruption, else 0.\n");
- System.exit(1);
- }
-
- if (!assertsOn())
- System.out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
-
- if (onlySegments.size() == 0)
- onlySegments = null;
- else if (doFix) {
- System.out.println("ERROR: cannot specify both -fix and -segment");
- System.exit(1);
- }
-
- System.out.println("\nOpening index @ " + indexPath + "\n");
- Directory dir = null;
- try {
- dir = FSDirectory.open(new File(indexPath));
- } catch (Throwable t) {
- System.out.println("ERROR: could not open directory \"" + indexPath + "\"; exiting");
- t.printStackTrace(System.out);
- System.exit(1);
- }
-
- CheckIndex checker = new CheckIndex(dir);
- checker.setInfoStream(System.out);
-
- Status result = checker.checkIndex(onlySegments);
- if (result.missingSegments) {
- System.exit(1);
- }
-
- if (!result.clean) {
- if (!doFix) {
- System.out.println("WARNING: would write new segments file, and " + result.totLoseDocCount + " documents would be lost, if -fix were specified\n");
- } else {
- System.out.println("WARNING: " + result.totLoseDocCount + " documents will be lost\n");
- System.out.println("NOTE: will write new segments file in 5 seconds; this will remove " + result.totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
- for(int s=0;s<5;s++) {
- Thread.sleep(1000);
- System.out.println(" " + (5-s) + "...");
- }
- System.out.println("Writing...");
- checker.fixIndex(result);
- System.out.println("OK");
- System.out.println("Wrote new segments file \"" + result.newSegments.getCurrentSegmentFileName() + "\"");
- }
- }
- System.out.println("");
-
- final int exitCode;
- if (result.clean == true)
- exitCode = 0;
- else
- exitCode = 1;
- System.exit(exitCode);
- }
-}