X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java diff --git a/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java new file mode 100644 index 0000000..2da66f8 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java @@ -0,0 +1,267 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.index.IndexWriter; // javadoc +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.Version; + +/** + * This tool splits input index into multiple equal parts. 
The method employed + * here uses {@link IndexWriter#addIndexes(IndexReader[])} where the input data + * comes from the input index with artificially applied deletes to the document + * id-s that fall outside the selected partition. + *
Note 1: Deletes are only applied to a buffered list of deleted docs and + * don't affect the source index - this tool also works with read-only indexes. + *
Note 2: the disadvantage of this tool is that the source index needs to be + * read as many times as there are parts to be created, hence the name of this + * tool. + * + *
NOTE: this tool is unaware of documents added
+ * atomically via {@link IndexWriter#addDocuments} or {@link
+ * IndexWriter#updateDocuments}, which means it can easily
+ * break up such document groups.
+ */
+public class MultiPassIndexSplitter {
+  /**
+   * Splits the source index into several parts, delegating to
+   * {@link #split(Version, IndexReader, Directory[], boolean)} with
+   * {@link Version#LUCENE_CURRENT} as the version argument.
+   *
+   * @param input source index, can be read-only, can have deletions, can have
+   * multiple segments (or multiple readers).
+   * @param outputs list of directories where the output parts will be stored.
+   * @param seq if true, the source index is split into equal, increasing
+   * ranges of document id-s. If false, source document id-s are assigned in a
+   * deterministic round-robin fashion to one of the output splits.
+   * @throws IOException propagated unchanged from the delegated call.
+   * @deprecated use {@link #split(Version, IndexReader, Directory[], boolean)} instead.
+   * This method will be removed in Lucene 4.0.
+   */
+  @Deprecated
+  public void split(IndexReader input, Directory[] outputs, boolean seq) throws IOException {
+    // This legacy overload has no version parameter, so it always uses the current one.
+    final Version defaultVersion = Version.LUCENE_CURRENT;
+    this.split(defaultVersion, input, outputs, seq);
+  }
+
+  /**
+   * Split source index into multiple parts.
+   * <p>
+   * The input is wrapped in a {@link FakeDeleteIndexReader} so that the
+   * deletions used to carve out each part are not applied to the original
+   * input. For every output the wrapper's deletions are reset, all document
+   * id-s outside the current part are marked as deleted, and what remains is
+   * copied with {@link IndexWriter#addIndexes(IndexReader[])} into an index
+   * opened in {@link OpenMode#CREATE} mode.
+   *
+   * @param version Lucene version passed to the {@link IndexWriterConfig} and
+   * to the analyzer of every output writer.
+   * @param input source index, can be read-only, can have deletions, can have
+   * multiple segments (or multiple readers).
+   * @param outputs list of directories where the output parts will be stored.
+   * @param seq if true, then the source index will be split into equal
+   * increasing ranges of document id-s. If false, source document id-s will be
+   * assigned in a deterministic round-robin fashion to one of the output splits.
+   * @throws IOException if <code>outputs</code> is null or has fewer than two
+   * entries, if <code>input</code> is null or reports fewer than two documents
+   * via <code>numDocs()</code>, or if reading the input or writing a part fails.
+   */
+  public void split(Version version, IndexReader input, Directory[] outputs, boolean seq) throws IOException {
+    if (outputs == null || outputs.length < 2) {
+      throw new IOException("Invalid number of outputs.");
+    }
+    if (input == null || input.numDocs() < 2) {
+      throw new IOException("Not enough documents for splitting");
+    }
+    int numParts = outputs.length;
+    // wrap a potentially read-only input
+    // this way we don't have to preserve original deletions because neither
+    // deleteDocument(int) or undeleteAll() is applied to the wrapped input index.
+    input = new FakeDeleteIndexReader(input);
+    int maxDoc = input.maxDoc();
+    int partLen = maxDoc / numParts;
+    for (int i = 0; i < numParts; i++) {
+      // start every pass from a clean slate of fake deletions
+      input.undeleteAll();
+      if (seq) { // sequential range
+        int lo = partLen * i;
+        int hi = lo + partLen;
+        // below range
+        for (int j = 0; j < lo; j++) {
+          input.deleteDocument(j);
+        }
+        // above range - last part collects all id-s that remained due to
+        // integer rounding errors
+        if (i < numParts - 1) {
+          for (int j = hi; j < maxDoc; j++) {
+            input.deleteDocument(j);
+          }
+        }
+      } else {
+        // round-robin: keep only the id-s congruent to i modulo numParts
+        for (int j = 0; j < maxDoc; j++) {
+          if ((j + numParts - i) % numParts != 0) {
+            input.deleteDocument(j);
+          }
+        }
+      }
+      // Only addIndexes is called on this writer; the analyzer is built with
+      // the caller's version to stay consistent with the writer configuration.
+      IndexWriter w = new IndexWriter(outputs[i], new IndexWriterConfig(
+          version,
+          new WhitespaceAnalyzer(version))
+          .setOpenMode(OpenMode.CREATE));
+      boolean success = false;
+      try {
+        System.err.println("Writing part " + (i + 1) + " ...");
+        w.addIndexes(new IndexReader[]{input});
+        success = true;
+      } finally {
+        if (success) {
+          w.close();
+        } else {
+          // Release the writer without committing a partial part, and without
+          // masking the exception that is already propagating.
+          try {
+            w.rollback();
+          } catch (IOException ignored) {
+            // the original failure is the one the caller needs to see
+          }
+        }
+      }
+    }
+    System.err.println("Done.");
+  }
+
+ @SuppressWarnings("deprecation")
+ public static void main(String[] args) throws Exception {
+ if (args.length < 5) {
+ System.err.println("Usage: MultiPassIndexSplitter -out