X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java?ds=inline
diff --git a/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java
new file mode 100644
index 0000000..c0c4ea9
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/IndexSorter.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.logging.Logger;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.IndexWriter; // javadocs
+import org.apache.lucene.store.*;
+import org.apache.lucene.util.Version;
+
+/** Sort an index by document importance factor. Higher scoring documents are
+ * assigned smaller document numbers. Document weights are obtained from a
+ * specified field, which has to be single-valued and stored, with string value
+ * that represents a float number. Stored fields in the output index remain
+ * consistent, i.e. both stored fields and postings are renumbered in sync.
+ *
+ *
+ * <p><b>NOTE</b>: this tool is unaware of documents added
+ * atomically via {@link IndexWriter#addDocuments} or {@link
+ * IndexWriter#updateDocuments}, which means it can easily
+ * break up such document groups.
+ */
+public class IndexSorter {
+ private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());
+
+ /**
+  * Per-posting record: the remapped (new) doc id plus the offset of the
+  * buffered (tf, positions) data in the temporary file. Natural order is
+  * ascending new doc id.
+  */
+ private static class PostingMap implements Comparable<PostingMap> {
+ private int newDoc;
+ private long offset;
+
+ public int compareTo(PostingMap pm) { // order by newDoc id
+ // compare explicitly rather than subtracting, to rule out overflow
+ return this.newDoc < pm.newDoc ? -1 : (this.newDoc == pm.newDoc ? 0 : 1);
+ }
+ }
+
+ /**
+  * A {@link TermPositions} view over a wrapped enumerator that emits the
+  * postings of each term in remapped (sorted) document order. On every
+  * {@link #seek(TermEnum)} it buffers all postings of the term (tf plus
+  * delta-encoded positions) into an in-memory temp file, sorts the
+  * per-document entries by new doc id, and then replays them from the
+  * buffer.  Payloads are not buffered and therefore not exposed.
+  */
+ private static class SortedTermPositions implements TermPositions {
+ private TermPositions original;
+ private int[] oldToNew;
+
+ private int docFreq;
+
+ // one entry per posting of the current term; grown lazily and reused
+ private PostingMap[] postingMaps = new PostingMap[0];
+ private int pointer;
+
+ private int freq;
+ private int position;
+
+ private static final String TEMP_FILE = "temp";
+ private final RAMDirectory tempDir = new RAMDirectory();
+ private RAMOutputStream out;
+ private IndexInput in;
+
+ public SortedTermPositions(TermPositions original, int[] oldToNew) {
+ this.original = original;
+ this.oldToNew = oldToNew;
+ try {
+ out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
+ } catch (IOException ioe) {
+ // NOTE(review): 'out' stays null after this warning, so a later
+ // seek() would fail with an NPE -- consider rethrowing instead.
+ LOG.warning("Error creating temporary output: " + ioe);
+ }
+ }
+
+ // Seeking by raw Term is not supported; presumably only seek(TermEnum)
+ // is exercised during the merge -- TODO confirm against the caller.
+ public void seek(Term term) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ // Buffers and re-sorts all postings of the term the enum is positioned on.
+ public void seek(TermEnum terms) throws IOException {
+ original.seek(terms);
+
+ docFreq = terms.docFreq();
+ pointer = -1;
+
+ if (docFreq > postingMaps.length) { // grow postingsMap
+ PostingMap[] newMap = new PostingMap[docFreq];
+ System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
+ for (int i = postingMaps.length; i < docFreq; i++) {
+ newMap[i] = new PostingMap();
+ }
+ postingMaps = newMap;
+ }
+
+ out.reset();
+
+ // Record, for every posting: the remapped doc id and the offset of
+ // its (tf, positions) record in the temp file.
+ int i = 0;
+ while (original.next()) {
+ PostingMap map = postingMaps[i++];
+ map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
+ map.offset = out.getFilePointer(); // save pointer to buffer
+
+ final int tf = original.freq(); // buffer tf & positions
+ out.writeVInt(tf);
+ int prevPosition = 0;
+ for (int j = tf; j > 0; j--) { // delta encode positions
+ int p = original.nextPosition();
+ out.writeVInt(p - prevPosition);
+ prevPosition = p;
+ }
+ }
+ out.flush();
+ docFreq = i; // allow for deletions
+
+ Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
+
+ // NOTE: this might be substantially faster if RAMInputStream were public
+ // and supported a reset() operation.
+ in = tempDir.openInput(TEMP_FILE);
+ }
+
+ // Advances to the next posting in new-doc-id order; seeks the temp file
+ // to that posting's buffered record and reads its tf.
+ public boolean next() throws IOException {
+ pointer++;
+ if (pointer < docFreq) {
+ in.seek(postingMaps[pointer].offset);
+ freq = in.readVInt();
+ position = 0;
+ return true;
+ }
+ return false;
+ }
+
+ public int doc() { return postingMaps[pointer].newDoc; }
+ public int freq() { return freq; }
+
+ // Positions were written delta-encoded; undo the delta on read.
+ public int nextPosition() throws IOException {
+ int positionIncrement = in.readVInt();
+ position += positionIncrement;
+ return position;
+ }
+
+ public int read(int[] docs, int[] freqs) {
+ throw new UnsupportedOperationException();
+ }
+ public boolean skipTo(int target) {
+ throw new UnsupportedOperationException();
+ }
+
+ // Payloads are never written to the temp buffer above, so none exist.
+ public byte[] getPayload(byte[] data, int offset) throws IOException {
+ return null;
+ }
+
+ public int getPayloadLength() {
+ return 0;
+ }
+
+ public boolean isPayloadAvailable() {
+ return false;
+ }
+
+ public void close() throws IOException {
+ original.close();
+ }
+
+ }
+
+ /**
+  * A read-only {@link FilterIndexReader} view that renumbers documents
+  * according to the supplied old-to-new mapping: stored fields, norms,
+  * postings and term vectors are all translated consistently so the
+  * output of {@link IndexWriter#addIndexes} comes out renumbered.
+  */
+ private static class SortingReader extends FilterIndexReader {
+
+ private int[] oldToNew;
+ private int[] newToOld;
+
+ public SortingReader(IndexReader oldReader, int[] oldToNew) {
+ super(oldReader);
+ this.oldToNew = oldToNew;
+
+ // Build the inverse mapping; entries of -1 (no new id assigned) are
+ // skipped.
+ this.newToOld = new int[oldReader.maxDoc()];
+ int oldDoc = 0;
+ while (oldDoc < oldToNew.length) {
+ int newDoc = oldToNew[oldDoc];
+ if (newDoc != -1) {
+ newToOld[newDoc] = oldDoc;
+ }
+ oldDoc++;
+ }
+ }
+
+ // Present this reader as a single atomic reader (no sub-readers).
+ @Override
+ public IndexReader[] getSequentialSubReaders() {
+ return null;
+ }
+
+ @Override
+ public Document document(int n) throws IOException {
+ return document(n, null);
+ }
+
+ // Stored fields for new doc n come from its original position.
+ @Override
+ public Document document(int n, FieldSelector fieldSelector)
+ throws CorruptIndexException, IOException {
+ return super.document(newToOld[n], fieldSelector);
+ }
+
+ // Every document appears live: deletions are not carried over.
+ @Override
+ public boolean isDeleted(int n) {
+ return false;
+ }
+
+ // Only the bulk norms(String, byte[], int) variant below is supported.
+ @Override
+ public byte[] norms(String f) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ // Copies the original norms into 'norms', renumbered via oldToNew.
+ // NOTE(review): the 'offset' parameter is ignored -- presumably callers
+ // always pass 0; confirm against the merge code.
+ @Override
+ public void norms(String f, byte[] norms, int offset) throws IOException {
+ byte[] oldNorms = super.norms(f);
+ int oldDoc = 0;
+ while (oldDoc < oldNorms.length) {
+ int newDoc = oldToNew[oldDoc];
+ if (newDoc != -1) {
+ norms[newDoc] = oldNorms[oldDoc];
+ }
+ oldDoc++;
+ }
+ }
+
+ // This view is read-only.
+ @Override
+ protected void doSetNorm(int d, String f, byte b) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ // Only termPositions() is supported; plain termDocs() is never used here.
+ @Override
+ public TermDocs termDocs() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ // Postings are renumbered and re-sorted by SortedTermPositions.
+ @Override
+ public TermPositions termPositions() throws IOException {
+ return new SortedTermPositions(super.termPositions(), oldToNew);
+ }
+
+ // Term vectors for new doc n come from its original position.
+ @Override
+ public TermFreqVector[] getTermFreqVectors(int docNumber)
+ throws IOException {
+ return super.getTermFreqVectors(newToOld[docNumber]);
+ }
+
+ // This view is read-only.
+ @Override
+ protected void doDelete(int n) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
+ private static class DocScore implements Comparable {
+ private int oldDoc;
+ private float score;
+
+ public int compareTo(DocScore that) { // order by score, oldDoc
+ if (this.score == that.score) {
+ return this.oldDoc - that.oldDoc;
+ } else {
+ return this.score < that.score ? 1 : -1 ;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "oldDoc=" + oldDoc + ",score=" + score;
+ }
+ }
+
+ /** Creates an index sorter; all inputs are supplied per {@code sort} call. */
+ public IndexSorter() {
+
+ }
+
+ /**
+  * Sorts the index in {@code input} into {@code output}, ordering
+  * documents by the descending float value of the given stored field.
+  *
+  * @param input  directory holding the index to sort
+  * @param output directory the sorted index is written to
+  * @param field  single-valued stored field whose string value parses as a
+  *               float weight; missing or unparsable values rank last
+  * @throws IOException if reading or writing an index fails
+  */
+ public void sort(Directory input, Directory output, String field) throws IOException {
+ LOG.info("IndexSorter: starting.");
+ long start = System.currentTimeMillis();
+ IndexReader reader = IndexReader.open(input, true);
+ try {
+ SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
+ IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31, new WhitespaceAnalyzer(Version.LUCENE_31));
+ IndexWriter writer = new IndexWriter(output, cfg);
+ try {
+ writer.addIndexes(new IndexReader[] { sorter });
+ } finally {
+ writer.close(); // commits and releases the output index
+ }
+ } finally {
+ reader.close(); // previously leaked: the reader was never closed
+ }
+ long end = System.currentTimeMillis();
+ LOG.info("IndexSorter: done, " + (end - start)
+ + " total milliseconds");
+ }
+
+ /**
+  * Computes the old-to-new doc id mapping: documents are ranked by the
+  * float value stored in {@code field}, descending, ties broken by the
+  * original doc id (see {@code DocScore.compareTo}).
+  *
+  * @param reader the input index
+  * @param field  name of the stored field holding the per-document weight
+  * @return array indexed by old doc id, holding the new doc id
+  */
+ private static int[] oldToNew(IndexReader reader, String field) throws IOException {
+ int readerMax = reader.maxDoc();
+ DocScore[] newToOld = new DocScore[readerMax];
+ // load only the scoring field when fetching each document
+ FieldSelector fSel = new MapFieldSelector(field);
+
+ for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
+ float score;
+ if (reader.isDeleted(oldDoc)) {
+ score = 0.0f;
+ } else {
+ Document d = reader.document(oldDoc, fSel);
+ try {
+ score = Float.parseFloat(d.get(field));
+ } catch (Exception e) {
+ // deliberate best-effort: a missing field (NPE from the null
+ // get()) or an unparsable value simply ranks the document last
+ score = 0.0f;
+ }
+ }
+ DocScore docScore = new DocScore();
+ docScore.oldDoc = oldDoc;
+ docScore.score = score;
+ newToOld[oldDoc] = docScore;
+ }
+ Arrays.sort(newToOld);
+
+ // invert: after sorting, index = new doc id, value.oldDoc = old doc id
+ int[] oldToNew = new int[readerMax];
+ for (int newDoc = 0; newDoc < readerMax; newDoc++) {
+ DocScore docScore = newToOld[newDoc];
+ oldToNew[docScore.oldDoc] = newDoc;
+ }
+ return oldToNew;
+ }
+
+ /** */
+ public static void main(String[] args) throws Exception {
+ Directory input, output;
+ String field;
+
+ String usage = "IndexSorter