1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter; // javadoc
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Version;
33 * This tool splits input index into multiple equal parts. The method employed
34 * here uses {@link IndexWriter#addIndexes(IndexReader[])} where the input data
35 * comes from the input index with artificially applied deletes to the document
36 * id-s that fall outside the selected partition.
37 * <p>Note 1: Deletes are only applied to a buffered list of deleted docs and
38 * don't affect the source index - this tool works also with read-only indexes.
39 * <p>Note 2: the disadvantage of this tool is that source index needs to be
40 * read as many times as there are parts to be created, hence the name of this
43 * <p><b>NOTE</b>: this tool is unaware of documents added
44 * atomically via {@link IndexWriter#addDocuments} or {@link
45 * IndexWriter#updateDocuments}, which means it can easily
46 * break up such document groups.
48 public class MultiPassIndexSplitter {
50 * Split source index into multiple parts.
51 * @param input source index, can be read-only, can have deletions, can have
52 * multiple segments (or multiple readers).
53 * @param outputs list of directories where the output parts will be stored.
54 * @param seq if true, then the source index will be split into equal
55 * increasing ranges of document id-s. If false, source document id-s will be
56 * assigned in a deterministic round-robin fashion to one of the output splits.
58 * @deprecated use {@link #split(Version, IndexReader, Directory[], boolean)} instead.
59 * This method will be removed in Lucene 4.0.
62 public void split(IndexReader input, Directory[] outputs, boolean seq) throws IOException {
63 split(Version.LUCENE_CURRENT, input, outputs, seq);
67 * Split source index into multiple parts.
68 * @param input source index, can be read-only, can have deletions, can have
69 * multiple segments (or multiple readers).
70 * @param outputs list of directories where the output parts will be stored.
71 * @param seq if true, then the source index will be split into equal
72 * increasing ranges of document id-s. If false, source document id-s will be
73 * assigned in a deterministic round-robin fashion to one of the output splits.
76 public void split(Version version, IndexReader input, Directory[] outputs, boolean seq) throws IOException {
77 if (outputs == null || outputs.length < 2) {
78 throw new IOException("Invalid number of outputs.");
80 if (input == null || input.numDocs() < 2) {
81 throw new IOException("Not enough documents for splitting");
83 int numParts = outputs.length;
84 // wrap a potentially read-only input
85 // this way we don't have to preserve original deletions because neither
86 // deleteDocument(int) or undeleteAll() is applied to the wrapped input index.
87 input = new FakeDeleteIndexReader(input);
88 int maxDoc = input.maxDoc();
89 int partLen = maxDoc / numParts;
90 for (int i = 0; i < numParts; i++) {
92 if (seq) { // sequential range
94 int hi = lo + partLen;
96 for (int j = 0; j < lo; j++) {
97 input.deleteDocument(j);
99 // above range - last part collects all id-s that remained due to
100 // integer rounding errors
101 if (i < numParts - 1) {
102 for (int j = hi; j < maxDoc; j++) {
103 input.deleteDocument(j);
108 for (int j = 0; j < maxDoc; j++) {
109 if ((j + numParts - i) % numParts != 0) {
110 input.deleteDocument(j);
114 IndexWriter w = new IndexWriter(outputs[i], new IndexWriterConfig(
116 new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
117 .setOpenMode(OpenMode.CREATE));
118 System.err.println("Writing part " + (i + 1) + " ...");
119 w.addIndexes(new IndexReader[]{input});
122 System.err.println("Done.");
125 @SuppressWarnings("deprecation")
126 public static void main(String[] args) throws Exception {
127 if (args.length < 5) {
128 System.err.println("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]");
129 System.err.println("\tinputIndex\tpath to input index, multiple values are ok");
130 System.err.println("\t-out ouputDir\tpath to output directory to contain partial indexes");
131 System.err.println("\t-num numParts\tnumber of parts to produce");
132 System.err.println("\t-seq\tsequential docid-range split (default is round-robin)");
135 ArrayList<IndexReader> indexes = new ArrayList<IndexReader>();
136 String outDir = null;
139 for (int i = 0; i < args.length; i++) {
140 if (args[i].equals("-out")) {
142 } else if (args[i].equals("-num")) {
143 numParts = Integer.parseInt(args[++i]);
144 } else if (args[i].equals("-seq")) {
147 File file = new File(args[i]);
148 if (!file.exists() || !file.isDirectory()) {
149 System.err.println("Invalid input path - skipping: " + file);
152 Directory dir = FSDirectory.open(new File(args[i]));
154 if (!IndexReader.indexExists(dir)) {
155 System.err.println("Invalid input index - skipping: " + file);
158 } catch (Exception e) {
159 System.err.println("Invalid input index - skipping: " + file);
162 indexes.add(IndexReader.open(dir, true));
165 if (outDir == null) {
166 throw new Exception("Required argument missing: -out outputDir");
169 throw new Exception("Invalid value of required argument: -num numParts");
171 if (indexes.size() == 0) {
172 throw new Exception("No input indexes to process");
174 File out = new File(outDir);
176 throw new Exception("Can't create output directory: " + out);
178 Directory[] dirs = new Directory[numParts];
179 for (int i = 0; i < numParts; i++) {
180 dirs[i] = FSDirectory.open(new File(out, "part-" + i));
182 MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
184 if (indexes.size() == 1) {
185 input = indexes.get(0);
187 input = new MultiReader(indexes.toArray(new IndexReader[indexes.size()]));
189 splitter.split(Version.LUCENE_CURRENT, input, dirs, seq);
193 * This class pretends that it can write deletions to the underlying index.
194 * Instead, deletions are buffered in a bitset and overlaid with the original
197 public static class FakeDeleteIndexReader extends FilterIndexReader {
201 public FakeDeleteIndexReader(IndexReader in) {
203 dels = new FixedBitSet(in.maxDoc());
204 if (in.hasDeletions()) {
205 oldDels = new FixedBitSet(in.maxDoc());
206 for (int i = 0; i < in.maxDoc(); i++) {
207 if (in.isDeleted(i)) oldDels.set(i);
214 public int numDocs() {
215 return in.maxDoc() - dels.cardinality();
219 * Just removes our overlaid deletions - does not undelete the original
223 protected void doUndeleteAll() throws CorruptIndexException, IOException {
224 dels = new FixedBitSet(in.maxDoc());
225 if (oldDels != null) {
231 protected void doDelete(int n) throws CorruptIndexException, IOException {
236 public boolean hasDeletions() {
237 return in.maxDoc() != this.numDocs();
241 public boolean isDeleted(int n) {
246 public IndexReader[] getSequentialSubReaders() {
251 public TermPositions termPositions() throws IOException {
252 return new FilterTermPositions(in.termPositions()) {
255 public boolean next() throws IOException {
257 while ((res = super.next())) {
258 if (!dels.get(doc())) {