+++ /dev/null
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.index;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.logging.Logger;
-
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.document.*;
-import org.apache.lucene.index.IndexWriter; // javadocs
-import org.apache.lucene.store.*;
-import org.apache.lucene.util.Version;
-
-/** Sort an index by document importance factor. Higher scoring documents are
- * assigned smaller document numbers. Document weights are obtained from a
- * specified field, which has to be single-valued and stored, with string value
- * that represents a float number. Stored fields in the output index remain
- * consistent, i.e. both stored fields and postings are renumbered in sync.
- *
- * <p><b>NOTE</b>: this tool is unaware of documents added
- * atomically via {@link IndexWriter#addDocuments} or {@link
- * IndexWriter#updateDocuments}, which means it can easily
- * break up such document groups.
- */
-public class IndexSorter {
- private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());
-
- /**
-  * One buffered posting of the term currently being re-sorted: the remapped
-  * document number plus the offset in the temporary buffer where that
-  * document's term frequency and positions were written.
-  */
- private static class PostingMap implements Comparable<PostingMap> {
-   private int newDoc;
-   private long offset;
-
-   /**
-    * Orders postings by ascending new document number.
-    *
-    * @param pm the posting to compare against
-    * @return a negative value, zero or a positive value when this posting's
-    *         new document number is smaller than, equal to or greater than
-    *         that of {@code pm}
-    */
-   public int compareTo(PostingMap pm) {
-     // Explicit three-way comparison instead of subtraction, so the result
-     // cannot be corrupted by int overflow whatever values newDoc holds.
-     if (this.newDoc < pm.newDoc) {
-       return -1;
-     }
-     return this.newDoc == pm.newDoc ? 0 : 1;
-   }
- }
-
- private static class SortedTermPositions implements TermPositions {
- private TermPositions original;
- private int[] oldToNew;
-
- private int docFreq;
-
- private PostingMap[] postingMaps = new PostingMap[0];
- private int pointer;
-
- private int freq;
- private int position;
-
- private static final String TEMP_FILE = "temp";
- private final RAMDirectory tempDir = new RAMDirectory();
- private RAMOutputStream out;
- private IndexInput in;
-
- public SortedTermPositions(TermPositions original, int[] oldToNew) {
- this.original = original;
- this.oldToNew = oldToNew;
- try {
- out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
- } catch (IOException ioe) {
- LOG.warning("Error creating temporary output: " + ioe);
- }
- }
-
- public void seek(Term term) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- public void seek(TermEnum terms) throws IOException {
- original.seek(terms);
-
- docFreq = terms.docFreq();
- pointer = -1;
-
- if (docFreq > postingMaps.length) { // grow postingsMap
- PostingMap[] newMap = new PostingMap[docFreq];
- System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
- for (int i = postingMaps.length; i < docFreq; i++) {
- newMap[i] = new PostingMap();
- }
- postingMaps = newMap;
- }
-
- out.reset();
-
- int i = 0;
- while (original.next()) {
- PostingMap map = postingMaps[i++];
- map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
- map.offset = out.getFilePointer(); // save pointer to buffer
-
- final int tf = original.freq(); // buffer tf & positions
- out.writeVInt(tf);
- int prevPosition = 0;
- for (int j = tf; j > 0; j--) { // delta encode positions
- int p = original.nextPosition();
- out.writeVInt(p - prevPosition);
- prevPosition = p;
- }
- }
- out.flush();
- docFreq = i; // allow for deletions
-
- Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
-
- // NOTE: this might be substantially faster if RAMInputStream were public
- // and supported a reset() operation.
- in = tempDir.openInput(TEMP_FILE);
- }
-
- public boolean next() throws IOException {
- pointer++;
- if (pointer < docFreq) {
- in.seek(postingMaps[pointer].offset);
- freq = in.readVInt();
- position = 0;
- return true;
- }
- return false;
- }
-
- public int doc() { return postingMaps[pointer].newDoc; }
- public int freq() { return freq; }
-
- public int nextPosition() throws IOException {
- int positionIncrement = in.readVInt();
- position += positionIncrement;
- return position;
- }
-
- public int read(int[] docs, int[] freqs) {
- throw new UnsupportedOperationException();
- }
- public boolean skipTo(int target) {
- throw new UnsupportedOperationException();
- }
-
- public byte[] getPayload(byte[] data, int offset) throws IOException {
- return null;
- }
-
- public int getPayloadLength() {
- return 0;
- }
-
- public boolean isPayloadAvailable() {
- return false;
- }
-
- public void close() throws IOException {
- original.close();
- }
-
- }
-
- private static class SortingReader extends FilterIndexReader {
-
- private int[] oldToNew;
- private int[] newToOld;
-
- public SortingReader(IndexReader oldReader, int[] oldToNew) {
- super(oldReader);
- this.oldToNew = oldToNew;
-
- this.newToOld = new int[oldReader.maxDoc()];
- int oldDoc = 0;
- while (oldDoc < oldToNew.length) {
- int newDoc = oldToNew[oldDoc];
- if (newDoc != -1) {
- newToOld[newDoc] = oldDoc;
- }
- oldDoc++;
- }
- }
-
- @Override
- public IndexReader[] getSequentialSubReaders() {
- return null;
- }
-
- @Override
- public Document document(int n) throws IOException {
- return document(n, null);
- }
-
- @Override
- public Document document(int n, FieldSelector fieldSelector)
- throws CorruptIndexException, IOException {
- return super.document(newToOld[n], fieldSelector);
- }
-
- @Override
- public boolean isDeleted(int n) {
- return false;
- }
-
- @Override
- public byte[] norms(String f) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void norms(String f, byte[] norms, int offset) throws IOException {
- byte[] oldNorms = super.norms(f);
- int oldDoc = 0;
- while (oldDoc < oldNorms.length) {
- int newDoc = oldToNew[oldDoc];
- if (newDoc != -1) {
- norms[newDoc] = oldNorms[oldDoc];
- }
- oldDoc++;
- }
- }
-
- @Override
- protected void doSetNorm(int d, String f, byte b) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public TermDocs termDocs() throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public TermPositions termPositions() throws IOException {
- return new SortedTermPositions(super.termPositions(), oldToNew);
- }
-
- @Override
- public TermFreqVector[] getTermFreqVectors(int docNumber)
- throws IOException {
- return super.getTermFreqVectors(newToOld[docNumber]);
- }
-
- @Override
- protected void doDelete(int n) throws IOException {
- throw new UnsupportedOperationException();
- }
-
- }
-
- /**
-  * Pairs an old document number with the score used to rank it. The natural
-  * order is descending score, ties broken by ascending old document number.
-  */
- private static class DocScore implements Comparable<DocScore> {
-   private int oldDoc;
-   private float score;
-
-   /**
-    * Orders by descending score, then ascending {@code oldDoc}. A NaN score
-    * sorts after every other score, and two NaN scores are ordered by
-    * {@code oldDoc}; without this, NaN would compare as "smaller" in both
-    * directions and break the ordering contract required by sorting.
-    *
-    * @param that the entry to compare against
-    * @return a negative value if this entry sorts first, a positive value
-    *         if {@code that} sorts first, zero if both are equivalent
-    */
-   public int compareTo(DocScore that) {
-     final boolean thisNaN = Float.isNaN(this.score);
-     final boolean thatNaN = Float.isNaN(that.score);
-     if (thisNaN != thatNaN) {
-       return thisNaN ? 1 : -1;                  // NaN ranks below any real score
-     }
-     if (!thisNaN && this.score != that.score) {
-       return this.score < that.score ? 1 : -1;  // higher score first
-     }
-     // Equal scores (or both NaN): keep the original document order.
-     if (this.oldDoc < that.oldDoc) {
-       return -1;
-     }
-     return this.oldDoc == that.oldDoc ? 0 : 1;
-   }
-
-   @Override
-   public String toString() {
-     return "oldDoc=" + oldDoc + ",score=" + score;
-   }
- }
-
- /** Creates a sorter. The class declares no instance fields, so nothing is initialised. */
- public IndexSorter() {
- }
-
- /**
-  * Writes a copy of the index in {@code input} to {@code output} with
-  * documents renumbered by descending value of {@code field}.
-  *
-  * @param input  directory holding the index to read
-  * @param output directory the sorted index is written to
-  * @param field  name of the stored field holding each document's weight
-  * @throws IOException if reading the source index or writing the sorted
-  *         index fails
-  */
- public void sort(Directory input, Directory output, String field) throws IOException {
-   LOG.info("IndexSorter: starting.");
-   long start = System.currentTimeMillis();
-   IndexReader reader = IndexReader.open(input, true);
-   try {
-     SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
-     IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31, new WhitespaceAnalyzer(Version.LUCENE_31));
-     IndexWriter writer = new IndexWriter(output, cfg);
-     boolean success = false;
-     try {
-       writer.addIndexes(new IndexReader[] { sorter });
-       success = true;
-     } finally {
-       if (success) {
-         writer.close();
-       } else {
-         // Release the writer without committing a partial index.
-         try {
-           writer.rollback();
-         } catch (IOException ignored) {
-           // Keep the failure from addIndexes as the one that propagates.
-         }
-       }
-     }
-   } finally {
-     // The source reader was previously never closed.
-     reader.close();
-   }
-   long end = System.currentTimeMillis();
-   LOG.info("IndexSorter: done, " + (end - start)
-       + " total milliseconds");
- }
-
- private static int[] oldToNew(IndexReader reader, String field) throws IOException {
- int readerMax = reader.maxDoc();
- DocScore[] newToOld = new DocScore[readerMax];
- FieldSelector fSel = new MapFieldSelector(field);
-
- for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
- float score;
- if (reader.isDeleted(oldDoc)) {
- score = 0.0f;
- } else {
- Document d = reader.document(oldDoc, fSel);
- try {
- score = Float.parseFloat(d.get(field));
- } catch (Exception e) {
- score = 0.0f;
- }
- }
- DocScore docScore = new DocScore();
- docScore.oldDoc = oldDoc;
- docScore.score = score;
- newToOld[oldDoc] = docScore;
- }
- Arrays.sort(newToOld);
-
- int[] oldToNew = new int[readerMax];
- for (int newDoc = 0; newDoc < readerMax; newDoc++) {
- DocScore docScore = newToOld[newDoc];
- oldToNew[docScore.oldDoc] = newDoc;
- }
- return oldToNew;
- }
-
- /**
-  * Command-line entry point: {@code IndexSorter <input> <output> <field>}.
-  * Prints the usage and exits with status -1 when fewer than three
-  * arguments are given. The output directory is created if it does not
-  * exist. A failure while sorting is logged as a warning rather than
-  * rethrown. Both directories are closed before returning.
-  *
-  * @param args input index path, output index path, weight field name
-  */
- public static void main(String[] args) throws Exception {
-   String usage = "IndexSorter <input> <output> <field>";
-
-   if (args.length < 3) {
-     System.err.println("Usage: " + usage);
-     System.exit(-1);
-   }
-
-   Directory input = FSDirectory.open(new File(args[0]));
-   try {
-     File out = new File(args[1]);
-     if (!out.exists()) out.mkdirs();
-     Directory output = FSDirectory.open(out);
-     try {
-       String field = args[2];
-       IndexSorter sorter = new IndexSorter();
-       sorter.sort(input, output, field);
-     } catch (Exception e) {
-       LOG.warning("IndexSorter: " + e);
-     } finally {
-       output.close();
-     }
-   } finally {
-     input.close();
-   }
- }
-}