2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.index;
21 import java.io.IOException;
22 import java.util.Arrays;
23 import java.util.logging.Logger;
25 import org.apache.lucene.analysis.WhitespaceAnalyzer;
26 import org.apache.lucene.document.*;
27 import org.apache.lucene.index.IndexWriter; // javadocs
28 import org.apache.lucene.store.*;
29 import org.apache.lucene.util.Version;
31 /** Sort an index by document importance factor. Higher scoring documents are
32 * assigned smaller document numbers. Document weights are obtained from a
33 * specified field, which has to be single-valued and stored, with string value
34 * that represents a float number. Stored fields in the output index remain
35 * consistent, i.e. both stored fields and postings are renumbered in sync.
37 * <p><b>NOTE</b>: this tool is unaware of documents added
38 * atomically via {@link IndexWriter#addDocuments} or {@link
39 * IndexWriter#updateDocuments}, which means it can easily
40 * break up such document groups.
42 public class IndexSorter {
43 private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());
45 private static class PostingMap implements Comparable<PostingMap> {
49 public int compareTo(PostingMap pm) { // order by newDoc id
50 return this.newDoc - pm.newDoc;
54 private static class SortedTermPositions implements TermPositions {
55 private TermPositions original;
56 private int[] oldToNew;
60 private PostingMap[] postingMaps = new PostingMap[0];
66 private static final String TEMP_FILE = "temp";
67 private final RAMDirectory tempDir = new RAMDirectory();
68 private RAMOutputStream out;
69 private IndexInput in;
71 public SortedTermPositions(TermPositions original, int[] oldToNew) {
72 this.original = original;
73 this.oldToNew = oldToNew;
75 out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
76 } catch (IOException ioe) {
77 LOG.warning("Error creating temporary output: " + ioe);
81 public void seek(Term term) throws IOException {
82 throw new UnsupportedOperationException();
85 public void seek(TermEnum terms) throws IOException {
88 docFreq = terms.docFreq();
91 if (docFreq > postingMaps.length) { // grow postingsMap
92 PostingMap[] newMap = new PostingMap[docFreq];
93 System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
94 for (int i = postingMaps.length; i < docFreq; i++) {
95 newMap[i] = new PostingMap();
103 while (original.next()) {
104 PostingMap map = postingMaps[i++];
105 map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
106 map.offset = out.getFilePointer(); // save pointer to buffer
108 final int tf = original.freq(); // buffer tf & positions
110 int prevPosition = 0;
111 for (int j = tf; j > 0; j--) { // delta encode positions
112 int p = original.nextPosition();
113 out.writeVInt(p - prevPosition);
118 docFreq = i; // allow for deletions
120 Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
122 // NOTE: this might be substantially faster if RAMInputStream were public
123 // and supported a reset() operation.
124 in = tempDir.openInput(TEMP_FILE);
127 public boolean next() throws IOException {
129 if (pointer < docFreq) {
130 in.seek(postingMaps[pointer].offset);
131 freq = in.readVInt();
138 public int doc() { return postingMaps[pointer].newDoc; }
139 public int freq() { return freq; }
141 public int nextPosition() throws IOException {
142 int positionIncrement = in.readVInt();
143 position += positionIncrement;
147 public int read(int[] docs, int[] freqs) {
148 throw new UnsupportedOperationException();
150 public boolean skipTo(int target) {
151 throw new UnsupportedOperationException();
154 public byte[] getPayload(byte[] data, int offset) throws IOException {
158 public int getPayloadLength() {
162 public boolean isPayloadAvailable() {
166 public void close() throws IOException {
172 private static class SortingReader extends FilterIndexReader {
174 private int[] oldToNew;
175 private int[] newToOld;
177 public SortingReader(IndexReader oldReader, int[] oldToNew) {
179 this.oldToNew = oldToNew;
181 this.newToOld = new int[oldReader.maxDoc()];
183 while (oldDoc < oldToNew.length) {
184 int newDoc = oldToNew[oldDoc];
186 newToOld[newDoc] = oldDoc;
193 public IndexReader[] getSequentialSubReaders() {
198 public Document document(int n) throws IOException {
199 return document(n, null);
203 public Document document(int n, FieldSelector fieldSelector)
204 throws CorruptIndexException, IOException {
205 return super.document(newToOld[n], fieldSelector);
209 public boolean isDeleted(int n) {
214 public byte[] norms(String f) throws IOException {
215 throw new UnsupportedOperationException();
219 public void norms(String f, byte[] norms, int offset) throws IOException {
220 byte[] oldNorms = super.norms(f);
222 while (oldDoc < oldNorms.length) {
223 int newDoc = oldToNew[oldDoc];
225 norms[newDoc] = oldNorms[oldDoc];
232 protected void doSetNorm(int d, String f, byte b) throws IOException {
233 throw new UnsupportedOperationException();
237 public TermDocs termDocs() throws IOException {
238 throw new UnsupportedOperationException();
242 public TermPositions termPositions() throws IOException {
243 return new SortedTermPositions(super.termPositions(), oldToNew);
247 public TermFreqVector[] getTermFreqVectors(int docNumber)
249 return super.getTermFreqVectors(newToOld[docNumber]);
253 protected void doDelete(int n) throws IOException {
254 throw new UnsupportedOperationException();
259 private static class DocScore implements Comparable<DocScore> {
263 public int compareTo(DocScore that) { // order by score, oldDoc
264 if (this.score == that.score) {
265 return this.oldDoc - that.oldDoc;
267 return this.score < that.score ? 1 : -1 ;
272 public String toString() {
273 return "oldDoc=" + oldDoc + ",score=" + score;
277 public IndexSorter() {
281 public void sort(Directory input, Directory output, String field) throws IOException {
282 LOG.info("IndexSorter: starting.");
283 long start = System.currentTimeMillis();
284 IndexReader reader = IndexReader.open(input, true);
286 SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
287 IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31, new WhitespaceAnalyzer(Version.LUCENE_31));
288 IndexWriter writer = new IndexWriter(output, cfg);
289 writer.addIndexes(new IndexReader[] { sorter });
291 long end = System.currentTimeMillis();
292 LOG.info("IndexSorter: done, " + (end - start)
293 + " total milliseconds");
296 private static int[] oldToNew(IndexReader reader, String field) throws IOException {
297 int readerMax = reader.maxDoc();
298 DocScore[] newToOld = new DocScore[readerMax];
299 FieldSelector fSel = new MapFieldSelector(field);
301 for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
303 if (reader.isDeleted(oldDoc)) {
306 Document d = reader.document(oldDoc, fSel);
308 score = Float.parseFloat(d.get(field));
309 } catch (Exception e) {
313 DocScore docScore = new DocScore();
314 docScore.oldDoc = oldDoc;
315 docScore.score = score;
316 newToOld[oldDoc] = docScore;
318 Arrays.sort(newToOld);
320 int[] oldToNew = new int[readerMax];
321 for (int newDoc = 0; newDoc < readerMax; newDoc++) {
322 DocScore docScore = newToOld[newDoc];
323 oldToNew[docScore.oldDoc] = newDoc;
329 public static void main(String[] args) throws Exception {
330 Directory input, output;
333 String usage = "IndexSorter <input> <output> <field>";
335 if (args.length < 3) {
336 System.err.println("Usage: " + usage);
340 input = FSDirectory.open(new File(args[0]));
341 File out = new File(args[1]);
342 if (!out.exists()) out.mkdirs();
343 output = FSDirectory.open(out);
345 IndexSorter sorter = new IndexSorter();
347 sorter.sort(input, output, field);
348 } catch (Exception e) {
349 LOG.warning("IndexSorter: " + e);