X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java

diff --git a/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java
new file mode 100644
index 0000000..c0c4ea9
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.logging.Logger;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.IndexWriter; // javadocs
+import org.apache.lucene.store.*;
+import org.apache.lucene.util.Version;
+
+/** Sort an index by document importance factor. Higher scoring documents are
+ * assigned smaller document numbers. Document weights are obtained from a
+ * specified field, which has to be single-valued and stored, with a string
+ * value that represents a float number. Stored fields in the output index
+ * remain consistent, i.e. both stored fields and postings are renumbered in
+ * sync.
+ *
+ * <p><b>NOTE</b>: this tool is unaware of documents added
+ * atomically via {@link IndexWriter#addDocuments} or {@link
+ * IndexWriter#updateDocuments}, which means it can easily
+ * break up such document groups.
+ */
+public class IndexSorter {
+  private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());
+
+  private static class PostingMap implements Comparable<PostingMap> {
+    private int newDoc;
+    private long offset;
+
+    public int compareTo(PostingMap pm) {            // order by newDoc id
+      return this.newDoc - pm.newDoc;
+    }
+  }
+
+  private static class SortedTermPositions implements TermPositions {
+    private TermPositions original;
+    private int[] oldToNew;
+
+    private int docFreq;
+
+    private PostingMap[] postingMaps = new PostingMap[0];
+    private int pointer;
+
+    private int freq;
+    private int position;
+
+    private static final String TEMP_FILE = "temp";
+    private final RAMDirectory tempDir = new RAMDirectory();
+    private RAMOutputStream out;
+    private IndexInput in;
+
+    public SortedTermPositions(TermPositions original, int[] oldToNew) {
+      this.original = original;
+      this.oldToNew = oldToNew;
+      try {
+        out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
+      } catch (IOException ioe) {
+        LOG.warning("Error creating temporary output: " + ioe);
+      }
+    }
+
+    public void seek(Term term) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    public void seek(TermEnum terms) throws IOException {
+      original.seek(terms);
+
+      docFreq = terms.docFreq();
+      pointer = -1;
+
+      if (docFreq > postingMaps.length) {            // grow postingsMap
+        PostingMap[] newMap = new PostingMap[docFreq];
+        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
+        for (int i = postingMaps.length; i < docFreq; i++) {
+          newMap[i] = new PostingMap();
+        }
+        postingMaps = newMap;
+      }
+
+      out.reset();
+
+      int i = 0;
+      while (original.next()) {
+        PostingMap map = postingMaps[i++];
+        map.newDoc = oldToNew[original.doc()];       // remap the newDoc id
+        map.offset = out.getFilePointer();           // save pointer to buffer
+
+        final int tf = original.freq();              // buffer tf & positions
+        out.writeVInt(tf);
+        int prevPosition = 0;
+        for (int j = tf; j > 0; j--) {               // delta encode positions
+          int p = original.nextPosition();
+          out.writeVInt(p - prevPosition);
+          prevPosition = p;
+        }
+      }
+      out.flush();
+      docFreq = i;                                   // allow for deletions
+
+      Arrays.sort(postingMaps, 0, docFreq);          // resort by mapped doc ids
+
+      // NOTE: this might be substantially faster if RAMInputStream were public
+      // and supported a reset() operation.
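+      // Buffering is necessary here because remapping breaks the ascending
+      // doc id order that postings consumers expect: the entries were
+      // re-sorted by new doc id above and are replayed from the temporary
+      // RAMDirectory below in that order.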
+      in = tempDir.openInput(TEMP_FILE);
+    }
+
+    public boolean next() throws IOException {
+      pointer++;
+      if (pointer < docFreq) {
+        in.seek(postingMaps[pointer].offset);
+        freq = in.readVInt();
+        position = 0;
+        return true;
+      }
+      return false;
+    }
+
+    public int doc() { return postingMaps[pointer].newDoc; }
+    public int freq() { return freq; }
+
+    public int nextPosition() throws IOException {
+      int positionIncrement = in.readVInt();
+      position += positionIncrement;
+      return position;
+    }
+
+    public int read(int[] docs, int[] freqs) {
+      throw new UnsupportedOperationException();
+    }
+    public boolean skipTo(int target) {
+      throw new UnsupportedOperationException();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return null;
+    }
+
+    public int getPayloadLength() {
+      return 0;
+    }
+
+    public boolean isPayloadAvailable() {
+      return false;
+    }
+
+    public void close() throws IOException {
+      original.close();
+    }
+
+  }
+
+  private static class SortingReader extends FilterIndexReader {
+
+    private int[] oldToNew;
+    private int[] newToOld;
+
+    public SortingReader(IndexReader oldReader, int[] oldToNew) {
+      super(oldReader);
+      this.oldToNew = oldToNew;
+
+      this.newToOld = new int[oldReader.maxDoc()];
+      int oldDoc = 0;
+      while (oldDoc < oldToNew.length) {
+        int newDoc = oldToNew[oldDoc];
+        if (newDoc != -1) {
+          newToOld[newDoc] = oldDoc;
+        }
+        oldDoc++;
+      }
+    }
+
+    @Override
+    public IndexReader[] getSequentialSubReaders() {
+      return null;
+    }
+
+    @Override
+    public Document document(int n) throws IOException {
+      return document(n, null);
+    }
+
+    @Override
+    public Document document(int n, FieldSelector fieldSelector)
+            throws CorruptIndexException, IOException {
+      return super.document(newToOld[n], fieldSelector);
+    }
+
+    @Override
+    public boolean isDeleted(int n) {
+      return false;
+    }
+
+    @Override
+    public byte[] norms(String f) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void norms(String f, byte[] norms, int offset) throws IOException {
+      byte[] oldNorms = super.norms(f);
+      int oldDoc = 0;
+      while (oldDoc < oldNorms.length) {
+        int newDoc = oldToNew[oldDoc];
+        if (newDoc != -1) {
+          norms[newDoc] = oldNorms[oldDoc];
+        }
+        oldDoc++;
+      }
+    }
+
+    @Override
+    protected void doSetNorm(int d, String f, byte b) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public TermDocs termDocs() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public TermPositions termPositions() throws IOException {
+      return new SortedTermPositions(super.termPositions(), oldToNew);
+    }
+
+    @Override
+    public TermFreqVector[] getTermFreqVectors(int docNumber)
+            throws IOException {
+      return super.getTermFreqVectors(newToOld[docNumber]);
+    }
+
+    @Override
+    protected void doDelete(int n) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+  }
+
+  private static class DocScore implements Comparable<DocScore> {
+    private int oldDoc;
+    private float score;
+
+    public int compareTo(DocScore that) {            // order by score, oldDoc
+      if (this.score == that.score) {
+        return this.oldDoc - that.oldDoc;
+      } else {
+        return this.score < that.score ? 1 : -1;
+      }
+    }
+
+    @Override
+    public String toString() {
+      return "oldDoc=" + oldDoc + ",score=" + score;
+    }
+  }
+
+  public IndexSorter() {
+
+  }
+
+  public void sort(Directory input, Directory output, String field) throws IOException {
+    LOG.info("IndexSorter: starting.");
+    long start = System.currentTimeMillis();
+    IndexReader reader = IndexReader.open(input, true);
+
+    SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
+    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31, new WhitespaceAnalyzer(Version.LUCENE_31));
+    IndexWriter writer = new IndexWriter(output, cfg);
+    writer.addIndexes(new IndexReader[] { sorter });
+    writer.close();
+    long end = System.currentTimeMillis();
+    LOG.info("IndexSorter: done, " + (end - start)
+        + " total milliseconds");
+  }
+
+  private static int[] oldToNew(IndexReader reader, String field) throws IOException {
+    int readerMax = reader.maxDoc();
+    DocScore[] newToOld = new DocScore[readerMax];
+    FieldSelector fSel = new MapFieldSelector(field);
+
+    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
+      float score;
+      if (reader.isDeleted(oldDoc)) {
+        score = 0.0f;
+      } else {
+        Document d = reader.document(oldDoc, fSel);
+        try {
+          score = Float.parseFloat(d.get(field));
+        } catch (Exception e) {
+          score = 0.0f;
+        }
+      }
+      DocScore docScore = new DocScore();
+      docScore.oldDoc = oldDoc;
+      docScore.score = score;
+      newToOld[oldDoc] = docScore;
+    }
+    Arrays.sort(newToOld);
+
+    int[] oldToNew = new int[readerMax];
+    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
+      DocScore docScore = newToOld[newDoc];
+      oldToNew[docScore.oldDoc] = newDoc;
+    }
+    return oldToNew;
+  }
+
+  /** */
+  public static void main(String[] args) throws Exception {
+    Directory input, output;
+    String field;
+
+    String usage = "IndexSorter <input> <output> <field>";
+
+    if (args.length < 3) {
+      System.err.println("Usage: " + usage);
+      System.exit(-1);
+    }
+
+    input = FSDirectory.open(new File(args[0]));
+    File out = new File(args[1]);
+    if (!out.exists()) out.mkdirs();
+    output = FSDirectory.open(out);
+    field = args[2];
+    IndexSorter sorter = new IndexSorter();
+    try {
+      sorter.sort(input, output, field);
+    } catch (Exception e) {
+      LOG.warning("IndexSorter: " + e);
+    }
+  }
+}
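
Usage sketch: besides the command line shown by the usage string above
(java org.apache.lucene.index.IndexSorter <input> <output> <field>), the
sorter can be driven programmatically through sort(Directory, Directory,
String). In the sketch below the index paths and the "weight" field name are
illustrative assumptions; any stored, single-valued field whose string value
parses as a float works, and higher values receive smaller document numbers.

    import java.io.File;

    import org.apache.lucene.index.IndexSorter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class SortByWeight {
      public static void main(String[] args) throws Exception {
        // Hypothetical index locations.
        Directory input = FSDirectory.open(new File("/tmp/index"));
        File outDir = new File("/tmp/index-sorted");
        if (!outDir.exists()) outDir.mkdirs();   // mirror IndexSorter.main()
        Directory output = FSDirectory.open(outDir);
        // "weight" is an assumed field name: it must be stored, single-valued,
        // and hold the string form of a float (parsed with Float.parseFloat).
        new IndexSorter().sort(input, output, "weight");
      }
    }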