lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/CheckIndex.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.search.IndexSearcher;
  21 import org.apache.lucene.search.TermQuery;
  22 import org.apache.lucene.store.FSDirectory;
  23 import org.apache.lucene.store.Directory;
  24 import org.apache.lucene.store.IndexInput;
  25 import org.apache.lucene.util.StringHelper;
  26 import org.apache.lucene.document.AbstractField;  // for javadocs
  27 import org.apache.lucene.document.Document;
  28
  29 import java.text.NumberFormat;
  30 import java.io.PrintStream;
  31 import java.io.IOException;
  32 import java.io.File;
  33 import java.util.Collection;
  34
  35 import java.util.Comparator;
  36 import java.util.List;
  37 import java.util.ArrayList;
  38 import java.util.Map;
  39
  40 /**
  41  * Basic tool and API to check the health of an index and
  42  * write a new segments file that removes reference to
  43  * problematic segments.
  44  *
  45  * <p>As this tool checks every byte in the index, on a large
  46  * index it can take quite a long time to run.
  47  *
  48  * @lucene.experimental Please make a complete backup of your
  49  * index before using this to fix your index!
  50  */
  51 public class CheckIndex {
  52
  53   private PrintStream infoStream;
  54   private Directory dir;
  55
  56   /**
  57    * Returned from {@link #checkIndex()} detailing the health and status of the index.
  58    *
  59    * @lucene.experimental
  60    **/
  61
  62   public static class Status {
  63
  64     /** True if no problems were found with the index. */
  65     public boolean clean;
  66
  67     /** True if we were unable to locate and load the segments_N file. */
  68     public boolean missingSegments;
  69
  70     /** True if we were unable to open the segments_N file. */
  71     public boolean cantOpenSegments;
  72
  73     /** True if we were unable to read the version number from segments_N file. */
  74     public boolean missingSegmentVersion;
  75
  76     /** Name of latest segments_N file in the index. */
  77     public String segmentsFileName;
  78
  79     /** Number of segments in the index. */
  80     public int numSegments;
  81
  82     /** String description of the version of the index. */
  83     public String segmentFormat;
  84
  85     /** Empty unless you passed specific segments list to check as optional 3rd argument.
  86      *  @see CheckIndex#checkIndex(List) */
  87     public List<String> segmentsChecked = new ArrayList<String>();
  88
  89     /** True if the index was created with a newer version of Lucene than the CheckIndex tool. */
  90     public boolean toolOutOfDate;
  91
  92     /** List of {@link SegmentInfoStatus} instances, detailing status of each segment. */
  93     public List<SegmentInfoStatus> segmentInfos = new ArrayList<SegmentInfoStatus>();
  94
  95     /** Directory index is in. */
  96     public Directory dir;
  97
  98     /**
  99      * SegmentInfos instance containing only segments that
 100      * had no problems (this is used with the {@link CheckIndex#fixIndex}
 101      * method to repair the index.
 102      */
 103     SegmentInfos newSegments;
 104
 105     /** How many documents will be lost to bad segments. */
 106     public int totLoseDocCount;
 107
 108     /** How many bad segments were found. */
 109     public int numBadSegments;
 110
 111     /** True if we checked only specific segments ({@link
 112      * #checkIndex(List)}) was called with non-null
 113      * argument). */
 114     public boolean partial;
 115
 116     /** The greatest segment name. */
 117     public int maxSegmentName;
 118
 119     /** Whether the SegmentInfos.counter is greater than any of the segments' names. */
 120     public boolean validCounter;
 121
 122     /** Holds the userData of the last commit in the index */
 123     public Map<String, String> userData;
 124
 125     /** Holds the status of each segment in the index.
 126      *  See {@link #segmentInfos}.
 127      *
 128      * <p><b>WARNING</b>: this API is new and experimental and is
 129      * subject to suddenly change in the next release.
 130      */
 131     public static class SegmentInfoStatus {
 132       /** Name of the segment. */
 133       public String name;
 134
 135       /** Document count (does not take deletions into account). */
 136       public int docCount;
 137
 138       /** True if segment is compound file format. */
 139       public boolean compound;
 140
 141       /** Number of files referenced by this segment. */
 142       public int numFiles;
 143
 144       /** Net size (MB) of the files referenced by this
 145        *  segment. */
 146       public double sizeMB;
 147
 148       /** Doc store offset, if this segment shares the doc
 149        *  store files (stored fields and term vectors) with
 150        *  other segments.  This is -1 if it does not share. */
 151       public int docStoreOffset = -1;
 152
 153       /** String of the shared doc store segment, or null if
 154        *  this segment does not share the doc store files. */
 155       public String docStoreSegment;
 156
 157       /** True if the shared doc store files are compound file
 158        *  format. */
 159       public boolean docStoreCompoundFile;
 160
 161       /** True if this segment has pending deletions. */
 162       public boolean hasDeletions;
 163
 164       /** Name of the current deletions file name. */
 165       public String deletionsFileName;
 166
 167       /** Number of deleted documents. */
 168       public int numDeleted;
 169
 170       /** True if we were able to open a SegmentReader on this
 171        *  segment. */
 172       public boolean openReaderPassed;
 173
 174       /** Number of fields in this segment. */
 175       int numFields;
 176
 177       /** True if at least one of the fields in this segment
 178        *  has position data
 179        *  @see AbstractField#setIndexOptions(org.apache.lucene.index.FieldInfo.IndexOptions) */
 180       public boolean hasProx;
 181
 182       /** Map that includes certain
 183        *  debugging details that IndexWriter records into
 184        *  each segment it creates */
 185       public Map<String,String> diagnostics;
 186
 187       /** Status for testing of field norms (null if field norms could not be tested). */
 188       public FieldNormStatus fieldNormStatus;
 189
 190       /** Status for testing of indexed terms (null if indexed terms could not be tested). */
 191       public TermIndexStatus termIndexStatus;
 192
 193       /** Status for testing of stored fields (null if stored fields could not be tested). */
 194       public StoredFieldStatus storedFieldStatus;
 195
 196       /** Status for testing of term vectors (null if term vectors could not be tested). */
 197       public TermVectorStatus termVectorStatus;
 198     }
 199
 200     /**
 201      * Status from testing field norms.
 202      */
 203     public static final class FieldNormStatus {
 204       /** Number of fields successfully tested */
 205       public long totFields = 0L;
 206
 207       /** Exception thrown during term index test (null on success) */
 208       public Throwable error = null;
 209     }
 210
 211     /**
 212      * Status from testing term index.
 213      */
 214     public static final class TermIndexStatus {
 215       /** Total term count */
 216       public long termCount = 0L;
 217
 218       /** Total frequency across all terms. */
 219       public long totFreq = 0L;
 220
 221       /** Total number of positions. */
 222       public long totPos = 0L;
 223
 224       /** Exception thrown during term index test (null on success) */
 225       public Throwable error = null;
 226     }
 227
 228     /**
 229      * Status from testing stored fields.
 230      */
 231     public static final class StoredFieldStatus {
 232
 233       /** Number of documents tested. */
 234       public int docCount = 0;
 235
 236       /** Total number of stored fields tested. */
 237       public long totFields = 0;
 238
 239       /** Exception thrown during stored fields test (null on success) */
 240       public Throwable error = null;
 241     }
 242
 243     /**
 244      * Status from testing stored fields.
 245      */
 246     public static final class TermVectorStatus {
 247
 248       /** Number of documents tested. */
 249       public int docCount = 0;
 250
 251       /** Total number of term vectors tested. */
 252       public long totVectors = 0;
 253
 254       /** Exception thrown during term vector test (null on success) */
 255       public Throwable error = null;
 256     }
 257   }
 258
 259   /** Create a new CheckIndex on the directory. */
 260   public CheckIndex(Directory dir) {
 261     this.dir = dir;
 262     infoStream = null;
 263   }
 264
 265   /** Set infoStream where messages should go.  If null, no
 266    *  messages are printed */
 267   public void setInfoStream(PrintStream out) {
 268     infoStream = out;
 269   }
 270
 271   private void msg(String msg) {
 272     if (infoStream != null)
 273       infoStream.println(msg);
 274   }
 275
 276   private static class MySegmentTermDocs extends SegmentTermDocs {
 277
 278     int delCount;
 279
 280     MySegmentTermDocs(SegmentReader p) {
 281       super(p);
 282     }
 283
 284     @Override
 285     public void seek(Term term) throws IOException {
 286       super.seek(term);
 287       delCount = 0;
 288     }
 289
 290     @Override
 291     protected void skippingDoc() throws IOException {
 292       delCount++;
 293     }
 294   }
 295
 296   /** Returns a {@link Status} instance detailing
 297    *  the state of the index.
 298    *
 299    *  <p>As this method checks every byte in the index, on a large
 300    *  index it can take quite a long time to run.
 301    *
 302    *  <p><b>WARNING</b>: make sure
 303    *  you only call this when the index is not opened by any
 304    *  writer. */
 305   public Status checkIndex() throws IOException {
 306     return checkIndex(null);
 307   }
 308
 309   /** Returns a {@link Status} instance detailing
 310    *  the state of the index.
 311    *
 312    *  @param onlySegments list of specific segment names to check
 313    *
 314    *  <p>As this method checks every byte in the specified
 315    *  segments, on a large index it can take quite a long
 316    *  time to run.
 317    *
 318    *  <p><b>WARNING</b>: make sure
 319    *  you only call this when the index is not opened by any
 320    *  writer. */
 321   public Status checkIndex(List<String> onlySegments) throws IOException {
 322     NumberFormat nf = NumberFormat.getInstance();
 323     SegmentInfos sis = new SegmentInfos();
 324     Status result = new Status();
 325     result.dir = dir;
 326     try {
 327       sis.read(dir);
 328     } catch (Throwable t) {
 329       msg("ERROR: could not read any segments file in directory");
 330       result.missingSegments = true;
 331       if (infoStream != null)
 332         t.printStackTrace(infoStream);
 333       return result;
 334     }
 335
 336     // find the oldest and newest segment versions
 337     String oldest = Integer.toString(Integer.MAX_VALUE), newest = Integer.toString(Integer.MIN_VALUE);
 338     String oldSegs = null;
 339     boolean foundNonNullVersion = false;
 340     Comparator<String> versionComparator = StringHelper.getVersionComparator();
 341     for (SegmentInfo si : sis) {
 342       String version = si.getVersion();
 343       if (version == null) {
 344         // pre-3.1 segment
 345         oldSegs = "pre-3.1";
 346       } else if (version.equals("2.x")) {
 347         // an old segment that was 'touched' by 3.1+ code
 348         oldSegs = "2.x";
 349       } else {
 350         foundNonNullVersion = true;
 351         if (versionComparator.compare(version, oldest) < 0) {
 352           oldest = version;
 353         }
 354         if (versionComparator.compare(version, newest) > 0) {
 355           newest = version;
 356         }
 357       }
 358     }
 359
 360     final int numSegments = sis.size();
 361     final String segmentsFileName = sis.getCurrentSegmentFileName();
 362     IndexInput input = null;
 363     try {
 364       input = dir.openInput(segmentsFileName);
 365     } catch (Throwable t) {
 366       msg("ERROR: could not open segments file in directory");
 367       if (infoStream != null)
 368         t.printStackTrace(infoStream);
 369       result.cantOpenSegments = true;
 370       return result;
 371     }
 372     int format = 0;
 373     try {
 374       format = input.readInt();
 375     } catch (Throwable t) {
 376       msg("ERROR: could not read segment file version in directory");
 377       if (infoStream != null)
 378         t.printStackTrace(infoStream);
 379       result.missingSegmentVersion = true;
 380       return result;
 381     } finally {
 382       if (input != null)
 383         input.close();
 384     }
 385
 386     String sFormat = "";
 387     boolean skip = false;
 388
 389     if (format == SegmentInfos.FORMAT)
 390       sFormat = "FORMAT [Lucene Pre-2.1]";
 391     if (format == SegmentInfos.FORMAT_LOCKLESS)
 392       sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
 393     else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
 394       sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
 395     else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
 396       sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
 397     else {
 398       if (format == SegmentInfos.FORMAT_CHECKSUM)
 399         sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
 400       else if (format == SegmentInfos.FORMAT_DEL_COUNT)
 401         sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
 402       else if (format == SegmentInfos.FORMAT_HAS_PROX)
 403         sFormat = "FORMAT_HAS_PROX [Lucene 2.4]";
 404       else if (format == SegmentInfos.FORMAT_USER_DATA)
 405         sFormat = "FORMAT_USER_DATA [Lucene 2.9]";
 406       else if (format == SegmentInfos.FORMAT_DIAGNOSTICS)
 407         sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]";
 408       else if (format == SegmentInfos.FORMAT_HAS_VECTORS)
 409         sFormat = "FORMAT_HAS_VECTORS [Lucene 3.1]";
 410       else if (format == SegmentInfos.FORMAT_3_1)
 411         sFormat = "FORMAT_3_1 [Lucene 3.1+]";
 412       else if (format == SegmentInfos.CURRENT_FORMAT)
 413         throw new RuntimeException("BUG: You should update this tool!");
 414       else if (format < SegmentInfos.CURRENT_FORMAT) {
 415         sFormat = "int=" + format + " [newer version of Lucene than this tool]";
 416         skip = true;
 417       } else {
 418         sFormat = format + " [Lucene 1.3 or prior]";
 419       }
 420     }
 421
 422     result.segmentsFileName = segmentsFileName;
 423     result.numSegments = numSegments;
 424     result.segmentFormat = sFormat;
 425     result.userData = sis.getUserData();
 426     String userDataString;
 427     if (sis.getUserData().size() > 0) {
 428       userDataString = " userData=" + sis.getUserData();
 429     } else {
 430       userDataString = "";
 431     }
 432
 433     String versionString = null;
 434     if (oldSegs != null) {
 435       if (foundNonNullVersion) {
 436         versionString = "versions=[" + oldSegs + " .. " + newest + "]";
 437       } else {
 438         versionString = "version=" + oldSegs;
 439       }
 440     } else {
 441       versionString = oldest.equals(newest) ? ( "version=" + oldest ) : ("versions=[" + oldest + " .. " + newest + "]");
 442     }
 443
 444     msg("Segments file=" + segmentsFileName + " numSegments=" + numSegments
 445         + " " + versionString + " format=" + sFormat + userDataString);
 446
 447     if (onlySegments != null) {
 448       result.partial = true;
 449       if (infoStream != null)
 450         infoStream.print("\nChecking only these segments:");
 451       for (String s : onlySegments) {
 452         if (infoStream != null)
 453           infoStream.print(" " + s);
 454       }
 455       result.segmentsChecked.addAll(onlySegments);
 456       msg(":");
 457     }
 458
 459     if (skip) {
 460       msg("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
 461       result.toolOutOfDate = true;
 462       return result;
 463     }
 464
 465
 466     result.newSegments = (SegmentInfos) sis.clone();
 467     result.newSegments.clear();
 468     result.maxSegmentName = -1;
 469
 470     for(int i=0;i<numSegments;i++) {
 471       final SegmentInfo info = sis.info(i);
 472       int segmentName = Integer.parseInt(info.name.substring(1), Character.MAX_RADIX);
 473       if (segmentName > result.maxSegmentName) {
 474         result.maxSegmentName = segmentName;
 475       }
 476       if (onlySegments != null && !onlySegments.contains(info.name))
 477         continue;
 478       Status.SegmentInfoStatus segInfoStat = new Status.SegmentInfoStatus();
 479       result.segmentInfos.add(segInfoStat);
 480       msg("  " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
 481       segInfoStat.name = info.name;
 482       segInfoStat.docCount = info.docCount;
 483
 484       int toLoseDocCount = info.docCount;
 485
 486       SegmentReader reader = null;
 487
 488       try {
 489         msg("    compound=" + info.getUseCompoundFile());
 490         segInfoStat.compound = info.getUseCompoundFile();
 491         msg("    hasProx=" + info.getHasProx());
 492         segInfoStat.hasProx = info.getHasProx();
 493         msg("    numFiles=" + info.files().size());
 494         segInfoStat.numFiles = info.files().size();
 495         segInfoStat.sizeMB = info.sizeInBytes(true)/(1024.*1024.);
 496         msg("    size (MB)=" + nf.format(segInfoStat.sizeMB));
 497         Map<String,String> diagnostics = info.getDiagnostics();
 498         segInfoStat.diagnostics = diagnostics;
 499         if (diagnostics.size() > 0) {
 500           msg("    diagnostics = " + diagnostics);
 501         }
 502
 503         final int docStoreOffset = info.getDocStoreOffset();
 504         if (docStoreOffset != -1) {
 505           msg("    docStoreOffset=" + docStoreOffset);
 506           segInfoStat.docStoreOffset = docStoreOffset;
 507           msg("    docStoreSegment=" + info.getDocStoreSegment());
 508           segInfoStat.docStoreSegment = info.getDocStoreSegment();
 509           msg("    docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());
 510           segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile();
 511         }
 512         final String delFileName = info.getDelFileName();
 513         if (delFileName == null){
 514           msg("    no deletions");
 515           segInfoStat.hasDeletions = false;
 516         }
 517         else{
 518           msg("    has deletions [delFileName=" + delFileName + "]");
 519           segInfoStat.hasDeletions = true;
 520           segInfoStat.deletionsFileName = delFileName;
 521         }
 522         if (infoStream != null)
 523           infoStream.print("    test: open reader.........");
 524         reader = SegmentReader.get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR);
 525
 526         segInfoStat.openReaderPassed = true;
 527
 528         final int numDocs = reader.numDocs();
 529         toLoseDocCount = numDocs;
 530         if (reader.hasDeletions()) {
 531           if (reader.deletedDocs.count() != info.getDelCount()) {
 532             throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
 533           }
 534           if (reader.deletedDocs.count() > reader.maxDoc()) {
 535             throw new RuntimeException("too many deleted docs: maxDoc()=" + reader.maxDoc() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
 536           }
 537           if (info.docCount - numDocs != info.getDelCount()){
 538             throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
 539           }
 540           segInfoStat.numDeleted = info.docCount - numDocs;
 541           msg("OK [" + (segInfoStat.numDeleted) + " deleted docs]");
 542         } else {
 543           if (info.getDelCount() != 0) {
 544             throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
 545           }
 546           msg("OK");
 547         }
 548         if (reader.maxDoc() != info.docCount)
 549           throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.docCount);
 550
 551         // Test getFieldNames()
 552         if (infoStream != null) {
 553           infoStream.print("    test: fields..............");
 554         }
 555         Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
 556         msg("OK [" + fieldNames.size() + " fields]");
 557         segInfoStat.numFields = fieldNames.size();
 558
 559         // Test Field Norms
 560         segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader);
 561
 562         // Test the Term Index
 563         segInfoStat.termIndexStatus = testTermIndex(info, reader);
 564
 565         // Test Stored Fields
 566         segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
 567
 568         // Test Term Vectors
 569         segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);
 570
 571         // Rethrow the first exception we encountered
 572         //  This will cause stats for failed segments to be incremented properly
 573         if (segInfoStat.fieldNormStatus.error != null) {
 574           throw new RuntimeException("Field Norm test failed");
 575         } else if (segInfoStat.termIndexStatus.error != null) {
 576           throw new RuntimeException("Term Index test failed");
 577         } else if (segInfoStat.storedFieldStatus.error != null) {
 578           throw new RuntimeException("Stored Field test failed");
 579         } else if (segInfoStat.termVectorStatus.error != null) {
 580           throw new RuntimeException("Term Vector test failed");
 581         }
 582
 583         msg("");
 584
 585       } catch (Throwable t) {
 586         msg("FAILED");
 587         String comment;
 588         comment = "fixIndex() would remove reference to this segment";
 589         msg("    WARNING: " + comment + "; full exception:");
 590         if (infoStream != null)
 591           t.printStackTrace(infoStream);
 592         msg("");
 593         result.totLoseDocCount += toLoseDocCount;
 594         result.numBadSegments++;
 595         continue;
 596       } finally {
 597         if (reader != null)
 598           reader.close();
 599       }
 600
 601       // Keeper
 602       result.newSegments.add((SegmentInfo) info.clone());
 603     }
 604
 605     if (0 == result.numBadSegments) {
 606       result.clean = true;
 607     } else
 608       msg("WARNING: " + result.numBadSegments + " broken segments (containing " + result.totLoseDocCount + " documents) detected");
 609
 610     if ( ! (result.validCounter = (result.maxSegmentName < sis.counter))) {
 611       result.clean = false;
 612       result.newSegments.counter = result.maxSegmentName + 1;
 613       msg("ERROR: Next segment name counter " + sis.counter + " is not greater than max segment name " + result.maxSegmentName);
 614     }
 615
 616     if (result.clean) {
 617       msg("No problems were detected with this index.\n");
 618     }
 619
 620     return result;
 621   }
 622
 623   /**
 624    * Test field norms.
 625    */
 626   private Status.FieldNormStatus testFieldNorms(Collection<String> fieldNames, SegmentReader reader) {
 627     final Status.FieldNormStatus status = new Status.FieldNormStatus();
 628
 629     try {
 630       // Test Field Norms
 631       if (infoStream != null) {
 632         infoStream.print("    test: field norms.........");
 633       }
 634       final byte[] b = new byte[reader.maxDoc()];
 635       for (final String fieldName : fieldNames) {
 636         if (reader.hasNorms(fieldName)) {
 637           reader.norms(fieldName, b, 0);
 638           ++status.totFields;
 639         }
 640       }
 641
 642       msg("OK [" + status.totFields + " fields]");
 643     } catch (Throwable e) {
 644       msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
 645       status.error = e;
 646       if (infoStream != null) {
 647         e.printStackTrace(infoStream);
 648       }
 649     }
 650
 651     return status;
 652   }
 653
 654   /**
 655    * Test the term index.
 656    */
 657   private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) {
 658     final Status.TermIndexStatus status = new Status.TermIndexStatus();
 659
 660     final IndexSearcher is = new IndexSearcher(reader);
 661
 662     try {
 663       if (infoStream != null) {
 664         infoStream.print("    test: terms, freq, prox...");
 665       }
 666
 667       final TermEnum termEnum = reader.terms();
 668       final TermPositions termPositions = reader.termPositions();
 669
 670       // Used only to count up # deleted docs for this term
 671       final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);
 672
 673       final int maxDoc = reader.maxDoc();
 674       Term lastTerm = null;
 675       while (termEnum.next()) {
 676         status.termCount++;
 677         final Term term = termEnum.term();
 678         lastTerm = term;
 679
 680         final int docFreq = termEnum.docFreq();
 681         termPositions.seek(term);
 682         int lastDoc = -1;
 683         int freq0 = 0;
 684         status.totFreq += docFreq;
 685         while (termPositions.next()) {
 686           freq0++;
 687           final int doc = termPositions.doc();
 688           final int freq = termPositions.freq();
 689           if (doc <= lastDoc)
 690             throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
 691           if (doc >= maxDoc)
 692             throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);
 693
 694           lastDoc = doc;
 695           if (freq <= 0)
 696             throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
 697
 698           int lastPos = -1;
 699           status.totPos += freq;
 700           for(int j=0;j<freq;j++) {
 701             final int pos = termPositions.nextPosition();
 702             if (pos < -1)
 703               throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
 704             if (pos < lastPos)
 705               throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
 706             lastPos = pos;
 707           }
 708         }
 709
 710         // Test skipping
 711         for(int idx=0;idx<7;idx++) {
 712           final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8);
 713           termPositions.seek(term);
 714           if (!termPositions.skipTo(skipDocID)) {
 715             break;
 716           } else {
 717
 718             final int docID = termPositions.doc();
 719             if (docID < skipDocID) {
 720               throw new RuntimeException("term " + term + ": skipTo(docID=" + skipDocID + ") returned docID=" + docID);
 721             }
 722             final int freq = termPositions.freq();
 723             if (freq <= 0) {
 724               throw new RuntimeException("termFreq " + freq + " is out of bounds");
 725             }
 726             int lastPosition = -1;
 727             for(int posUpto=0;posUpto<freq;posUpto++) {
 728               final int pos = termPositions.nextPosition();
 729               if (pos < 0) {
 730                 throw new RuntimeException("position " + pos + " is out of bounds");
 731               }
 732               // TODO: we should assert when all pos == 0 that positions are actually omitted
 733               if (pos < lastPosition) {
 734                 throw new RuntimeException("position " + pos + " is < lastPosition " + lastPosition);
 735               }
 736               lastPosition = pos;
 737             }
 738
 739             if (!termPositions.next()) {
 740               break;
 741             }
 742             final int nextDocID = termPositions.doc();
 743             if (nextDocID <= docID) {
 744               throw new RuntimeException("term " + term + ": skipTo(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
 745             }
 746           }
 747         }
 748
 749         // Now count how many deleted docs occurred in
 750         // this term:
 751         final int delCount;
 752         if (reader.hasDeletions()) {
 753           myTermDocs.seek(term);
 754           while(myTermDocs.next()) { }
 755           delCount = myTermDocs.delCount;
 756         } else {
 757           delCount = 0;
 758         }
 759
 760         if (freq0 + delCount != docFreq) {
 761           throw new RuntimeException("term " + term + " docFreq=" +
 762                                      docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
 763         }
 764       }
 765
 766       // Test search on last term:
 767       if (lastTerm != null) {
 768         is.search(new TermQuery(lastTerm), 1);
 769       }
 770
 771       msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
 772
 773     } catch (Throwable e) {
 774       msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
 775       status.error = e;
 776       if (infoStream != null) {
 777         e.printStackTrace(infoStream);
 778       }
 779     }
 780
 781     return status;
 782   }
 783
 784   /**
 785    * Test stored fields for a segment.
 786    */
 787   private Status.StoredFieldStatus testStoredFields(SegmentInfo info, SegmentReader reader, NumberFormat format) {
 788     final Status.StoredFieldStatus status = new Status.StoredFieldStatus();
 789
 790     try {
 791       if (infoStream != null) {
 792         infoStream.print("    test: stored fields.......");
 793       }
 794
 795       // Scan stored fields for all documents
 796       for (int j = 0; j < info.docCount; ++j) {
 797         if (!reader.isDeleted(j)) {
 798           status.docCount++;
 799           Document doc = reader.document(j);
 800           status.totFields += doc.getFields().size();
 801         }
 802       }
 803
 804       // Validate docCount
 805       if (status.docCount != reader.numDocs()) {
 806         throw new RuntimeException("docCount=" + status.docCount + " but saw " + status.docCount + " undeleted docs");
 807       }
 808
 809       msg("OK [" + status.totFields + " total field count; avg " +
 810           format.format((((float) status.totFields)/status.docCount)) + " fields per doc]");
 811     } catch (Throwable e) {
 812       msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
 813       status.error = e;
 814       if (infoStream != null) {
 815         e.printStackTrace(infoStream);
 816       }
 817     }
 818
 819     return status;
 820   }
 821
 822   /**
 823    * Test term vectors for a segment.
 824    */
 825   private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
 826     final Status.TermVectorStatus status = new Status.TermVectorStatus();
 827
 828     try {
 829       if (infoStream != null) {
 830         infoStream.print("    test: term vectors........");
 831       }
 832
 833       for (int j = 0; j < info.docCount; ++j) {
 834         if (!reader.isDeleted(j)) {
 835           status.docCount++;
 836           TermFreqVector[] tfv = reader.getTermFreqVectors(j);
 837           if (tfv != null) {
 838             status.totVectors += tfv.length;
 839           }
 840         }
 841       }
 842
 843       msg("OK [" + status.totVectors + " total vector count; avg " +
 844           format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
 845     } catch (Throwable e) {
 846       msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
 847       status.error = e;
 848       if (infoStream != null) {
 849         e.printStackTrace(infoStream);
 850       }
 851     }
 852
 853     return status;
 854   }
 855
 856   /** Repairs the index using previously returned result
 857    *  from {@link #checkIndex}.  Note that this does not
 858    *  remove any of the unreferenced files after it's done;
 859    *  you must separately open an {@link IndexWriter}, which
 860    *  deletes unreferenced files when it's created.
 861    *
 862    * <p><b>WARNING</b>: this writes a
 863    *  new segments file into the index, effectively removing
 864    *  all documents in broken segments from the index.
 865    *  BE CAREFUL.
 866    *
 867    * <p><b>WARNING</b>: Make sure you only call this when the
 868    *  index is not opened  by any writer. */
 869   public void fixIndex(Status result) throws IOException {
 870     if (result.partial)
 871       throw new IllegalArgumentException("can only fix an index that was fully checked (this status checked a subset of segments)");
 872     result.newSegments.changed();
 873     result.newSegments.commit(result.dir);
 874   }
 875
 876   private static boolean assertsOn;
 877
 878   private static boolean testAsserts() {
 879     assertsOn = true;
 880     return true;
 881   }
 882
 883   private static boolean assertsOn() {
 884     assert testAsserts();
 885     return assertsOn;
 886   }
 887
 888   /** Command-line interface to check and fix an index.
 889
 890     <p>
 891     Run it like this:
 892     <pre>
 893     java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
 894     </pre>
 895     <ul>
 896     <li><code>-fix</code>: actually write a new segments_N file, removing any problematic segments
 897
 898     <li><code>-segment X</code>: only check the specified
 899     segment(s).  This can be specified multiple times,
 900     to check more than one segment, eg <code>-segment _2
 901     -segment _a</code>.  You can't use this with the -fix
 902     option.
 903     </ul>
 904
 905     <p><b>WARNING</b>: <code>-fix</code> should only be used on an emergency basis as it will cause
 906                        documents (perhaps many) to be permanently removed from the index.  Always make
 907                        a backup copy of your index before running this!  Do not run this tool on an index
 908                        that is actively being written to.  You have been warned!
 909
 910     <p>                Run without -fix, this tool will open the index, report version information
 911                        and report any exceptions it hits and what action it would take if -fix were
 912                        specified.  With -fix, this tool will remove any segments that have issues and
 913                        write a new segments_N file.  This means all documents contained in the affected
 914                        segments will be removed.
 915
 916     <p>
 917                        This tool exits with exit code 1 if the index cannot be opened or has any
 918                        corruption, else 0.
 919    */
 920   public static void main(String[] args) throws IOException, InterruptedException {
 921
 922     boolean doFix = false;
 923     List<String> onlySegments = new ArrayList<String>();
 924     String indexPath = null;
 925     int i = 0;
 926     while(i < args.length) {
 927       if (args[i].equals("-fix")) {
 928         doFix = true;
 929         i++;
 930       } else if (args[i].equals("-segment")) {
 931         if (i == args.length-1) {
 932           System.out.println("ERROR: missing name for -segment option");
 933           System.exit(1);
 934         }
 935         onlySegments.add(args[i+1]);
 936         i += 2;
 937       } else {
 938         if (indexPath != null) {
 939           System.out.println("ERROR: unexpected extra argument '" + args[i] + "'");
 940           System.exit(1);
 941         }
 942         indexPath = args[i];
 943         i++;
 944       }
 945     }
 946
 947     if (indexPath == null) {
 948       System.out.println("\nERROR: index path not specified");
 949       System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" +
 950                          "\n" +
 951                          "  -fix: actually write a new segments_N file, removing any problematic segments\n" +
 952                          "  -segment X: only check the specified segments.  This can be specified multiple\n" +
 953                          "              times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
 954                          "              You can't use this with the -fix option\n" +
 955                          "\n" +
 956                          "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
 957                          "documents (perhaps many) to be permanently removed from the index.  Always make\n" +
 958                          "a backup copy of your index before running this!  Do not run this tool on an index\n" +
 959                          "that is actively being written to.  You have been warned!\n" +
 960                          "\n" +
 961                          "Run without -fix, this tool will open the index, report version information\n" +
 962                          "and report any exceptions it hits and what action it would take if -fix were\n" +
 963                          "specified.  With -fix, this tool will remove any segments that have issues and\n" +
 964                          "write a new segments_N file.  This means all documents contained in the affected\n" +
 965                          "segments will be removed.\n" +
 966                          "\n" +
 967                          "This tool exits with exit code 1 if the index cannot be opened or has any\n" +
 968                          "corruption, else 0.\n");
 969       System.exit(1);
 970     }
 971
 972     if (!assertsOn())
 973       System.out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
 974
 975     if (onlySegments.size() == 0)
 976       onlySegments = null;
 977     else if (doFix) {
 978       System.out.println("ERROR: cannot specify both -fix and -segment");
 979       System.exit(1);
 980     }
 981
 982     System.out.println("\nOpening index @ " + indexPath + "\n");
 983     Directory dir = null;
 984     try {
 985       dir = FSDirectory.open(new File(indexPath));
 986     } catch (Throwable t) {
 987       System.out.println("ERROR: could not open directory \"" + indexPath + "\"; exiting");
 988       t.printStackTrace(System.out);
 989       System.exit(1);
 990     }
 991
 992     CheckIndex checker = new CheckIndex(dir);
 993     checker.setInfoStream(System.out);
 994
 995     Status result = checker.checkIndex(onlySegments);
 996     if (result.missingSegments) {
 997       System.exit(1);
 998     }
 999
1000     if (!result.clean) {
1001       if (!doFix) {
1002         System.out.println("WARNING: would write new segments file, and " + result.totLoseDocCount + " documents would be lost, if -fix were specified\n");
1003       } else {
1004         System.out.println("WARNING: " + result.totLoseDocCount + " documents will be lost\n");
1005         System.out.println("NOTE: will write new segments file in 5 seconds; this will remove " + result.totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
1006         for(int s=0;s<5;s++) {
1007           Thread.sleep(1000);
1008           System.out.println("  " + (5-s) + "...");
1009         }
1010         System.out.println("Writing...");
1011         checker.fixIndex(result);
1012         System.out.println("OK");
1013         System.out.println("Wrote new segments file \"" + result.newSegments.getCurrentSegmentFileName() + "\"");
1014       }
1015     }
1016     System.out.println("");
1017
1018     final int exitCode;
1019     if (result.clean == true)
1020       exitCode = 0;
1021     else
1022       exitCode = 1;
1023     System.exit(exitCode);
1024   }
1025 }