1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter; // javadoc
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Version;
33 * This tool splits input index into multiple equal parts. The method employed
34 * here uses {@link IndexWriter#addIndexes(IndexReader[])} where the input data
35 * comes from the input index with artificially applied deletes to the document
36 * id-s that fall outside the selected partition.
37 * <p>Note 1: Deletes are only applied to a buffered list of deleted docs and
38 * don't affect the source index - this tool works also with read-only indexes.
39 * <p>Note 2: the disadvantage of this tool is that source index needs to be
40 * read as many times as there are parts to be created, hence the name of this
43 * <p><b>NOTE</b>: this tool is unaware of documents added
44 * atomically via {@link IndexWriter#addDocuments} or {@link
45 * IndexWriter#updateDocuments}, which means it can easily
46 * break up such document groups.
48 public class MultiPassIndexSplitter {
50 * Split source index into multiple parts.
51 * @param input source index, can be read-only, can have deletions, can have
52 * multiple segments (or multiple readers).
53 * @param outputs list of directories where the output parts will be stored.
54 * @param seq if true, then the source index will be split into equal
55 * increasing ranges of document id-s. If false, source document id-s will be
56 * assigned in a deterministic round-robin fashion to one of the output splits.
58 * @deprecated use {@link #split(Version, IndexReader, Directory[], boolean)} instead.
59 * This method will be removed in Lucene 4.0.
62 public void split(IndexReader input, Directory[] outputs, boolean seq) throws IOException {
63 split(Version.LUCENE_CURRENT, input, outputs, seq);
67 * Split source index into multiple parts.
68 * @param input source index, can be read-only, can have deletions, can have
69 * multiple segments (or multiple readers).
70 * @param outputs list of directories where the output parts will be stored.
71 * @param seq if true, then the source index will be split into equal
72 * increasing ranges of document id-s. If false, source document id-s will be
73 * assigned in a deterministic round-robin fashion to one of the output splits.
76 public void split(Version version, IndexReader input, Directory[] outputs, boolean seq) throws IOException {
77 if (outputs == null || outputs.length < 2) {
78 throw new IOException("Invalid number of outputs.");
80 if (input == null || input.numDocs() < 2) {
81 throw new IOException("Not enough documents for splitting");
83 int numParts = outputs.length;
84 // wrap a potentially read-only input
85 // this way we don't have to preserve original deletions because neither
86 // deleteDocument(int) or undeleteAll() is applied to the wrapped input index.
87 input = new FakeDeleteIndexReader(input);
88 int maxDoc = input.maxDoc();
89 int partLen = maxDoc / numParts;
90 for (int i = 0; i < numParts; i++) {
92 if (seq) { // sequential range
94 int hi = lo + partLen;
96 for (int j = 0; j < lo; j++) {
97 input.deleteDocument(j);
99 // above range - last part collects all id-s that remained due to
100 // integer rounding errors
101 if (i < numParts - 1) {
102 for (int j = hi; j < maxDoc; j++) {
103 input.deleteDocument(j);
108 for (int j = 0; j < maxDoc; j++) {
109 if ((j + numParts - i) % numParts != 0) {
110 input.deleteDocument(j);
114 IndexWriter w = new IndexWriter(outputs[i], new IndexWriterConfig(
116 new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
117 .setOpenMode(OpenMode.CREATE));
118 System.err.println("Writing part " + (i + 1) + " ...");
119 w.addIndexes(new IndexReader[]{input});
122 System.err.println("Done.");
125 @SuppressWarnings("deprecation")
126 public static void main(String[] args) throws Exception {
127 if (args.length < 5) {
128 System.err.println("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]");
129 System.err.println("\tinputIndex\tpath to input index, multiple values are ok");
130 System.err.println("\t-out ouputDir\tpath to output directory to contain partial indexes");
131 System.err.println("\t-num numParts\tnumber of parts to produce");
132 System.err.println("\t-seq\tsequential docid-range split (default is round-robin)");
135 ArrayList<IndexReader> indexes = new ArrayList<IndexReader>();
136 String outDir = null;
139 for (int i = 0; i < args.length; i++) {
140 if (args[i].equals("-out")) {
142 } else if (args[i].equals("-num")) {
143 numParts = Integer.parseInt(args[++i]);
144 } else if (args[i].equals("-seq")) {
147 File file = new File(args[i]);
148 if (!file.exists() || !file.isDirectory()) {
149 System.err.println("Invalid input path - skipping: " + file);
152 Directory dir = FSDirectory.open(new File(args[i]));
154 if (!IndexReader.indexExists(dir)) {
155 System.err.println("Invalid input index - skipping: " + file);
158 } catch (Exception e) {
159 System.err.println("Invalid input index - skipping: " + file);
162 indexes.add(IndexReader.open(dir, true));
165 if (outDir == null) {
166 throw new Exception("Required argument missing: -out outputDir");
169 throw new Exception("Invalid value of required argument: -num numParts");
171 if (indexes.size() == 0) {
172 throw new Exception("No input indexes to process");
174 File out = new File(outDir);
176 throw new Exception("Can't create output directory: " + out);
178 Directory[] dirs = new Directory[numParts];
179 for (int i = 0; i < numParts; i++) {
180 dirs[i] = FSDirectory.open(new File(out, "part-" + i));
182 MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
184 if (indexes.size() == 1) {
185 input = indexes.get(0);
187 input = new MultiReader(indexes.toArray(new IndexReader[indexes.size()]));
189 splitter.split(Version.LUCENE_CURRENT, input, dirs, seq);
193 * This class pretends that it can write deletions to the underlying index.
194 * Instead, deletions are buffered in a bitset and overlaid with the original
197 public static class FakeDeleteIndexReader extends FilterIndexReader {
201 public FakeDeleteIndexReader(IndexReader in) {
203 dels = new FixedBitSet(in.maxDoc());
204 if (in.hasDeletions()) {
205 oldDels = new FixedBitSet(in.maxDoc());
206 for (int i = 0; i < in.maxDoc(); i++) {
207 if (in.isDeleted(i)) oldDels.set(i);
214 public int numDocs() {
215 return in.maxDoc() - dels.cardinality();
219 * Just removes our overlaid deletions - does not undelete the original
223 protected void doUndeleteAll() throws CorruptIndexException, IOException {
224 dels = new FixedBitSet(in.maxDoc());
225 if (oldDels != null) {
231 protected void doDelete(int n) throws CorruptIndexException, IOException {
236 public boolean hasDeletions() {
237 return in.maxDoc() != this.numDocs();
241 public boolean isDeleted(int n) {
246 public IndexReader[] getSequentialSubReaders() {
251 public TermPositions termPositions() throws IOException {
252 return new FilterTermPositions(in.termPositions()) {
255 public boolean next() throws IOException {
257 while ((res = super.next())) {
258 if (!dels.get(doc())) {