/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;

/** Sort an index by document importance factor. Higher scoring documents are
 * assigned smaller document numbers. Document weights are obtained from a
 * specified field, which has to be single-valued and stored, with string value
 * that represents a float number. Stored fields in the output index remain
 * consistent, i.e. both stored fields and postings are renumbered in sync.
 *
 * <p><b>NOTE</b>: this tool is unaware of documents added
 * atomically via {@link IndexWriter#addDocuments} or {@link
 * IndexWriter#updateDocuments}, which means it can easily
 * break up such document groups.
 */
public class IndexSorter {
  private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());

  /** Maps one posting entry to its remapped doc id plus the offset of its
   *  buffered tf/positions data in the temporary stream. */
  private static class PostingMap implements Comparable<PostingMap> {
    private int newDoc;
    private long offset;

    public int compareTo(PostingMap pm) {       // order by newDoc id
      // doc ids are non-negative, so plain subtraction cannot overflow here
      return this.newDoc - pm.newDoc;
    }
  }

  /** A {@link TermPositions} view that replays the postings of the wrapped
   *  enumerator re-sorted by the remapped (new) document numbers. Postings
   *  for the current term are buffered into an in-memory file, then read
   *  back in newDoc order via {@link #postingMaps}. */
  private static class SortedTermPositions implements TermPositions {
    private TermPositions original;
    private int[] oldToNew;                     // old doc id -> new doc id

    private int docFreq;

    // reusable, grown on demand; only the first docFreq entries are valid
    private PostingMap[] postingMaps = new PostingMap[0];
    private int pointer;                        // index into postingMaps

    private int freq;                           // tf of the current doc
    private int position;                       // running (decoded) position

    private static final String TEMP_FILE = "temp";
    private final RAMDirectory tempDir = new RAMDirectory();
    private RAMOutputStream out;
    private IndexInput in;

    public SortedTermPositions(TermPositions original, int[] oldToNew) {
      this.original = original;
      this.oldToNew = oldToNew;
      try {
        out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
      } catch (IOException ioe) {
        // NOTE(review): a failure here leaves 'out' null and seek() will NPE
        // later; kept as a warning to preserve the original best-effort style
        LOG.warning("Error creating temporary output: " + ioe);
      }
    }

    public void seek(Term term) throws IOException {
      throw new UnsupportedOperationException();
    }

    /** Positions this enumerator on the term of {@code terms}: buffers every
     *  posting (tf + delta-encoded positions) of the original enumerator,
     *  remaps doc ids, and sorts them by new doc id for replay. */
    public void seek(TermEnum terms) throws IOException {
      original.seek(terms);

      docFreq = terms.docFreq();
      pointer = -1;

      if (docFreq > postingMaps.length) {       // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
          newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
      }

      out.reset();

      int i = 0;
      while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()];  // remap the newDoc id
        map.offset = out.getFilePointer();      // save pointer to buffer

        final int tf = original.freq();         // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) {          // delta encode positions
          int p = original.nextPosition();
          out.writeVInt(p - prevPosition);
          prevPosition = p;
        }
      }
      out.flush();
      docFreq = i;                              // allow for deletions

      Arrays.sort(postingMaps, 0, docFreq);     // resort by mapped doc ids

      // NOTE: this might be substantially faster if RAMInputStream were public
      // and supported a reset() operation.
      in = tempDir.openInput(TEMP_FILE);
    }

    public boolean next() throws IOException {
      pointer++;
      if (pointer < docFreq) {
        in.seek(postingMaps[pointer].offset);
        freq = in.readVInt();
        position = 0;
        return true;
      }
      return false;
    }

    public int doc() { return postingMaps[pointer].newDoc; }
    public int freq() { return freq; }

    public int nextPosition() throws IOException {
      int positionIncrement = in.readVInt();
      position += positionIncrement;            // undo the delta encoding
      return position;
    }

    public int read(int[] docs, int[] freqs) {
      throw new UnsupportedOperationException();
    }
    public boolean skipTo(int target) {
      throw new UnsupportedOperationException();
    }

    // payloads are not preserved by this tool
    public byte[] getPayload(byte[] data, int offset) throws IOException {
      return null;
    }

    public int getPayloadLength() {
      return 0;
    }

    public boolean isPayloadAvailable() {
      return false;
    }

    public void close() throws IOException {
      // fix: previously only 'original' was closed, leaking the temp
      // in-memory file handles and directory
      original.close();
      if (in != null) in.close();
      if (out != null) out.close();
      tempDir.close();
    }

  }

  /** A reader view that renumbers documents according to {@code oldToNew}:
   *  stored fields and term vectors are fetched through {@code newToOld},
   *  postings are re-sorted via {@link SortedTermPositions}. */
  private static class SortingReader extends FilterIndexReader {

    private int[] oldToNew;                     // -1 marks a deleted doc
    private int[] newToOld;                     // inverse mapping

    public SortingReader(IndexReader oldReader, int[] oldToNew) {
      super(oldReader);
      this.oldToNew = oldToNew;

      this.newToOld = new int[oldReader.maxDoc()];
      int oldDoc = 0;
      while (oldDoc < oldToNew.length) {
        int newDoc = oldToNew[oldDoc];
        if (newDoc != -1) {
          newToOld[newDoc] = oldDoc;
        }
        oldDoc++;
      }
    }

    @Override
    public IndexReader[] getSequentialSubReaders() {
      // present a single flat reader so addIndexes consumes this view directly
      return null;
    }

    @Override
    public Document document(int n) throws IOException {
      return document(n, null);
    }

    @Override
    public Document document(int n, FieldSelector fieldSelector)
        throws CorruptIndexException, IOException {
      return super.document(newToOld[n], fieldSelector);
    }

    @Override
    public boolean isDeleted(int n) {
      // deletions are squeezed out by the renumbering
      return false;
    }

    @Override
    public byte[] norms(String f) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public void norms(String f, byte[] norms, int offset) throws IOException {
      // remap norms into the caller-supplied array in new doc order
      byte[] oldNorms = super.norms(f);
      int oldDoc = 0;
      while (oldDoc < oldNorms.length) {
        int newDoc = oldToNew[oldDoc];
        if (newDoc != -1) {
          norms[newDoc] = oldNorms[oldDoc];
        }
        oldDoc++;
      }
    }

    @Override
    protected void doSetNorm(int d, String f, byte b) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public TermDocs termDocs() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public TermPositions termPositions() throws IOException {
      return new SortedTermPositions(super.termPositions(), oldToNew);
    }

    @Override
    public TermFreqVector[] getTermFreqVectors(int docNumber)
        throws IOException {
      return super.getTermFreqVectors(newToOld[docNumber]);
    }

    @Override
    protected void doDelete(int n) throws IOException {
      throw new UnsupportedOperationException();
    }

  }

  /** Pairs an old doc id with its weight; sorts by descending score,
   *  ties broken by ascending old doc id. */
  private static class DocScore implements Comparable<DocScore> {
    private int oldDoc;
    private float score;

    public int compareTo(DocScore that) {       // order by score, oldDoc
      // fix: Float.compare provides a consistent total order (the previous
      // ==/< comparison violated the Comparable contract for NaN scores)
      int byScore = Float.compare(that.score, this.score);  // descending
      if (byScore != 0) {
        return byScore;
      }
      return this.oldDoc - that.oldDoc;         // ids non-negative, no overflow
    }

    @Override
    public String toString() {
      return "oldDoc=" + oldDoc + ",score=" + score;
    }
  }

  public IndexSorter() {

  }

  /** Sorts {@code input} into {@code output} by descending weight read from
   *  the stored, single-valued {@code field}.
   *  @throws IOException on index read/write failure */
  public void sort(Directory input, Directory output, String field) throws IOException {
    LOG.info("IndexSorter: starting.");
    long start = System.currentTimeMillis();
    IndexReader reader = IndexReader.open(input, true);
    try {
      SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
      IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31,
          new WhitespaceAnalyzer(Version.LUCENE_31));
      IndexWriter writer = new IndexWriter(output, cfg);
      try {
        writer.addIndexes(new IndexReader[] { sorter });
      } finally {
        writer.close();
      }
    } finally {
      reader.close();                           // fix: reader was never closed
    }
    long end = System.currentTimeMillis();
    LOG.info("IndexSorter: done, " + (end - start)
          + " total milliseconds");
  }

  /** Computes the old-&gt;new doc id mapping: docs are ranked by the float
   *  value stored in {@code field} (descending); deleted or unparsable docs
   *  score 0.0f and sink to the end. */
  private static int[] oldToNew(IndexReader reader, String field) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];
    FieldSelector fSel = new MapFieldSelector(field);

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
      float score;
      if (reader.isDeleted(oldDoc)) {
        score = 0.0f;
      } else {
        Document d = reader.document(oldDoc, fSel);
        try {
          score = Float.parseFloat(d.get(field));
        } catch (Exception e) {
          // missing field or malformed number: deliberate best-effort default
          score = 0.0f;
        }
      }
      DocScore docScore = new DocScore();
      docScore.oldDoc = oldDoc;
      docScore.score = score;
      newToOld[oldDoc] = docScore;
    }
    Arrays.sort(newToOld);

    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
      DocScore docScore = newToOld[newDoc];
      oldToNew[docScore.oldDoc] = newDoc;
    }
    return oldToNew;
  }

  /** Command line: {@code IndexSorter <input> <output> <field>} */
  public static void main(String[] args) throws Exception {
    Directory input, output;
    String field;

    String usage = "IndexSorter <input> <output> <field>";

    if (args.length < 3) {
      System.err.println("Usage: " + usage);
      System.exit(-1);
    }

    input = FSDirectory.open(new File(args[0]));
    File out = new File(args[1]);
    if (!out.exists()) out.mkdirs();
    output = FSDirectory.open(out);
    field = args[2];
    IndexSorter sorter = new IndexSorter();
    try {
      sorter.sort(input, output, field);
    } catch (Exception e) {
      // fix: log the full stack trace instead of only e.toString()
      LOG.log(Level.WARNING, "IndexSorter failed", e);
    }
  }
}