package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter; // javadoc
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Version;

/**
 * This tool splits an input index into multiple equal parts. The method
 * employed here uses {@link IndexWriter#addIndexes(IndexReader[])} where the
 * input data comes from the input index with artificially applied deletes to
 * the document ids that fall outside the selected partition.
 * <p>
 * Note 1: Deletes are only applied to a buffered list of deleted docs and
 * don't affect the source index - this tool works also with read-only indexes.
 * <p>
 * Note 2: the disadvantage of this tool is that the source index needs to be
 * read as many times as there are parts to be created, hence the name of this
 * tool.
 * <p>
 * <b>NOTE</b>: this tool is unaware of documents added atomically via
 * {@link IndexWriter#addDocuments} or {@link IndexWriter#updateDocuments},
 * which means it can easily break up such document groups.
 */
public class MultiPassIndexSplitter {

  /**
   * Split the source index into multiple parts.
   *
   * @param input source index, can be read-only, can have deletions, can have
   *          multiple segments (or multiple readers).
   * @param outputs list of directories where the output parts will be stored.
   * @param seq if true, then the source index will be split into equal
   *          increasing ranges of document ids. If false, source document ids
   *          will be assigned in a deterministic round-robin fashion to one of
   *          the output splits.
   * @throws IOException if the split parameters are invalid or an index
   *           cannot be read or written.
   * @deprecated use {@link #split(Version, IndexReader, Directory[], boolean)}
   *             instead. This method will be removed in Lucene 4.0.
   */
  @Deprecated
  public void split(IndexReader input, Directory[] outputs, boolean seq) throws IOException {
    // Delegates to the Version-aware overload with the "current" version.
    split(Version.LUCENE_CURRENT, input, outputs, seq);
  }

  /**
   * Split the source index into multiple parts.
   *
   * @param version the {@link Version} passed to the {@link IndexWriterConfig}
   *          used when writing each output part.
   * @param input source index, can be read-only, can have deletions, can have
   *          multiple segments (or multiple readers).
   * @param outputs list of directories where the output parts will be stored;
   *          at least two are required.
   * @param seq if true, then the source index will be split into equal
   *          increasing ranges of document ids. If false, source document ids
   *          will be assigned in a deterministic round-robin fashion to one of
   *          the output splits.
   * @throws IOException if fewer than two outputs or fewer than two documents
   *           are supplied, or an index cannot be read or written.
   */
  public void split(Version version, IndexReader input, Directory[] outputs, boolean seq) throws IOException {
    if (outputs == null || outputs.length < 2) {
      throw new IOException("Invalid number of outputs.");
    }
    if (input == null || input.numDocs() < 2) {
      throw new IOException("Not enough documents for splitting");
    }
    int numParts = outputs.length;
    // Wrap a potentially read-only input: this way we don't have to preserve
    // original deletions, because neither deleteDocument(int) nor
    // undeleteAll() is applied to the wrapped input index.
    input = new FakeDeleteIndexReader(input);
    int maxDoc = input.maxDoc();
    int partLen = maxDoc / numParts;
    for (int i = 0; i < numParts; i++) {
      // Reset the overlaid deletions before selecting this part's documents.
      input.undeleteAll();
      if (seq) { // sequential range
        int lo = partLen * i;
        int hi = lo + partLen;
        // delete everything below the range
        for (int j = 0; j < lo; j++) {
          input.deleteDocument(j);
        }
        // delete everything above the range - except for the last part, which
        // collects all ids that remained due to integer rounding errors
        if (i < numParts - 1) {
          for (int j = hi; j < maxDoc; j++) {
            input.deleteDocument(j);
          }
        }
      } else {
        // round-robin: keep only every numParts-th doc, offset by i
        for (int j = 0; j < maxDoc; j++) {
          if ((j + numParts - i) % numParts != 0) {
            input.deleteDocument(j);
          }
        }
      }
      IndexWriter w = new IndexWriter(outputs[i], new IndexWriterConfig(
          version,
          new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
          .setOpenMode(OpenMode.CREATE));
      System.err.println("Writing part " + (i + 1) + " ...");
      w.addIndexes(new IndexReader[]{input});
      w.close();
    }
    System.err.println("Done.");
  }

  /**
   * Command-line entry point.
   * <p>
   * Usage: {@code MultiPassIndexSplitter -out <outputDir> -num <numParts>
   * [-seq] <inputIndex1> [<inputIndex2> ...]}
   *
   * @param args command-line arguments as described above.
   * @throws Exception if required arguments are missing or invalid, or the
   *           output directory cannot be created.
   */
  @SuppressWarnings("deprecation")
  public static void main(String[] args) throws Exception {
    if (args.length < 5) {
      System.err.println("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2>...]");
      System.err.println("\tinputIndex\tpath to input index, multiple values are ok");
      System.err.println("\t-out ouputDir\tpath to output directory to contain partial indexes");
      System.err.println("\t-num numParts\tnumber of parts to produce");
      System.err.println("\t-seq\tsequential docid-range split (default is round-robin)");
      System.exit(-1);
    }
    ArrayList<IndexReader> indexes = new ArrayList<IndexReader>();
    String outDir = null;
    int numParts = -1;
    boolean seq = false;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-out")) {
        outDir = args[++i];
      } else if (args[i].equals("-num")) {
        numParts = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-seq")) {
        seq = true;
      } else {
        // Anything else is treated as an input index path; invalid paths are
        // skipped (best-effort) rather than aborting the whole run.
        File file = new File(args[i]);
        if (!file.exists() || !file.isDirectory()) {
          System.err.println("Invalid input path - skipping: " + file);
          continue;
        }
        Directory dir = FSDirectory.open(new File(args[i]));
        try {
          if (!IndexReader.indexExists(dir)) {
            System.err.println("Invalid input index - skipping: " + file);
            continue;
          }
        } catch (Exception e) {
          System.err.println("Invalid input index - skipping: " + file);
          continue;
        }
        indexes.add(IndexReader.open(dir, true));
      }
    }
    if (outDir == null) {
      throw new Exception("Required argument missing: -out outputDir");
    }
    if (numParts < 2) {
      throw new Exception("Invalid value of required argument: -num numParts");
    }
    if (indexes.size() == 0) {
      throw new Exception("No input indexes to process");
    }
    File out = new File(outDir);
    if (!out.mkdirs()) {
      throw new Exception("Can't create output directory: " + out);
    }
    Directory[] dirs = new Directory[numParts];
    for (int i = 0; i < numParts; i++) {
      dirs[i] = FSDirectory.open(new File(out, "part-" + i));
    }
    MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
    IndexReader input;
    if (indexes.size() == 1) {
      input = indexes.get(0);
    } else {
      input = new MultiReader(indexes.toArray(new IndexReader[indexes.size()]));
    }
    splitter.split(Version.LUCENE_CURRENT, input, dirs, seq);
  }

  /**
   * This class pretends that it can write deletions to the underlying index.
   * Instead, deletions are buffered in a bitset and overlaid with the original
   * list of deletions.
   */
  public static class FakeDeleteIndexReader extends FilterIndexReader {
    // Overlay of deletions: original deletions plus the artificially applied ones.
    FixedBitSet dels;
    // Snapshot of the wrapped reader's original deletions (null if it had none).
    FixedBitSet oldDels;

    public FakeDeleteIndexReader(IndexReader in) {
      super(in);
      dels = new FixedBitSet(in.maxDoc());
      if (in.hasDeletions()) {
        oldDels = new FixedBitSet(in.maxDoc());
        for (int i = 0; i < in.maxDoc(); i++) {
          if (in.isDeleted(i)) oldDels.set(i);
        }
        dels.or(oldDels);
      }
    }

    @Override
    public int numDocs() {
      return in.maxDoc() - dels.cardinality();
    }

    /**
     * Just removes our overlaid deletions - does not undelete the original
     * deletions.
     */
    @Override
    protected void doUndeleteAll() throws CorruptIndexException, IOException {
      dels = new FixedBitSet(in.maxDoc());
      if (oldDels != null) {
        dels.or(oldDels);
      }
    }

    @Override
    protected void doDelete(int n) throws CorruptIndexException, IOException {
      dels.set(n);
    }

    @Override
    public boolean hasDeletions() {
      return in.maxDoc() != this.numDocs();
    }

    @Override
    public boolean isDeleted(int n) {
      return dels.get(n);
    }

    @Override
    public IndexReader[] getSequentialSubReaders() {
      // Present this reader as atomic so consumers see the overlaid deletions.
      return null;
    }

    @Override
    public TermPositions termPositions() throws IOException {
      // Filter out positions of documents deleted in the overlay.
      return new FilterTermPositions(in.termPositions()) {

        @Override
        public boolean next() throws IOException {
          boolean res;
          while ((res = super.next())) {
            if (!dels.get(doc())) {
              break;
            }
          }
          return res;
        }
      };
    }
  }
}