lucene-java-3.4.0/lucene/src/java/org/apache/lucene/index/TermInfosWriter.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20
  21 import java.io.Closeable;
  22 import java.io.IOException;
  23 import org.apache.lucene.store.IndexOutput;
  24 import org.apache.lucene.store.Directory;
  25 import org.apache.lucene.util.IOUtils;
  26 import org.apache.lucene.util.UnicodeUtil;
  27 import org.apache.lucene.util.ArrayUtil;
  28
  29
  30 /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  31   Directory.  A TermInfos can be written once, in order.  */
  32
  33 final class TermInfosWriter implements Closeable {
  34   /** The file format version, a negative number. */
  35   public static final int FORMAT = -3;
  36
  37   // Changed strings to true utf8 with length-in-bytes not
  38   // length-in-chars
  39   public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
  40
  41   // NOTE: always change this if you switch to a new format!
  42   public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
  43
  44   private FieldInfos fieldInfos;
  45   private IndexOutput output;
  46   private TermInfo lastTi = new TermInfo();
  47   private long size;
  48
  49   // TODO: the default values for these two parameters should be settable from
  50   // IndexWriter.  However, once that's done, folks will start setting them to
  51   // ridiculous values and complaining that things don't work well, as with
  52   // mergeFactor.  So, let's wait until a number of folks find that alternate
  53   // values work better.  Note that both of these values are stored in the
  54   // segment, so that it's safe to change these w/o rebuilding all indexes.
  55
  56   /** Expert: The fraction of terms in the "dictionary" which should be stored
  57    * in RAM.  Smaller values use more memory, but make searching slightly
  58    * faster, while larger values use less memory and make searching slightly
  59    * slower.  Searching is typically not dominated by dictionary lookup, so
  60    * tweaking this is rarely useful.*/
  61   int indexInterval = 128;
  62
  63   /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
  64    * used to accelerate {@link TermDocs#skipTo(int)}.  Larger values result in
  65    * smaller indexes, greater acceleration, but fewer accelerable cases, while
  66    * smaller values result in bigger indexes, less acceleration and more
  67    * accelerable cases. More detailed experiments would be useful here. */
  68   int skipInterval = 16;
  69
  70   /** Expert: The maximum number of skip levels. Smaller values result in
  71    * slightly smaller indexes, but slower skipping in big posting lists.
  72    */
  73   int maxSkipLevels = 10;
  74
  75   private long lastIndexPointer;
  76   private boolean isIndex;
  77   private byte[] lastTermBytes = new byte[10];
  78   private int lastTermBytesLength = 0;
  79   private int lastFieldNumber = -1;
  80
  81   private TermInfosWriter other;
  82   private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
  83
  84   TermInfosWriter(Directory directory, String segment, FieldInfos fis,
  85                   int interval)
  86        throws IOException {
  87     initialize(directory, segment, fis, interval, false);
  88     boolean success = false;
  89     try {
  90       other = new TermInfosWriter(directory, segment, fis, interval, true);
  91       other.other = this;
  92       success = true;
  93     } finally {
  94       if (!success) {
  95         IOUtils.closeWhileHandlingException(output, other);
  96       }
  97     }
  98   }
  99
 100   private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
 101                           int interval, boolean isIndex) throws IOException {
 102     initialize(directory, segment, fis, interval, isIndex);
 103   }
 104
 105   private void initialize(Directory directory, String segment, FieldInfos fis,
 106                           int interval, boolean isi) throws IOException {
 107     indexInterval = interval;
 108     fieldInfos = fis;
 109     isIndex = isi;
 110     output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
 111     boolean success = false;
 112     try {
 113       output.writeInt(FORMAT_CURRENT);              // write format
 114       output.writeLong(0);                          // leave space for size
 115       output.writeInt(indexInterval);               // write indexInterval
 116       output.writeInt(skipInterval);                // write skipInterval
 117       output.writeInt(maxSkipLevels);               // write maxSkipLevels
 118       assert initUTF16Results();
 119       success = true;
 120     } finally {
 121       if (!success) {
 122         IOUtils.closeWhileHandlingException(output);
 123       }
 124     }
 125   }
 126
 127   void add(Term term, TermInfo ti) throws IOException {
 128     UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result);
 129     add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti);
 130   }
 131
 132   // Currently used only by assert statements
 133   UnicodeUtil.UTF16Result utf16Result1;
 134   UnicodeUtil.UTF16Result utf16Result2;
 135
 136   // Currently used only by assert statements
 137   private boolean initUTF16Results() {
 138     utf16Result1 = new UnicodeUtil.UTF16Result();
 139     utf16Result2 = new UnicodeUtil.UTF16Result();
 140     return true;
 141   }
 142
 143   // Currently used only by assert statement
 144   private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
 145
 146     if (lastFieldNumber != fieldNumber) {
 147       final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
 148       // If there is a field named "" (empty string) then we
 149       // will get 0 on this comparison, yet, it's "OK".  But
 150       // it's not OK if two different field numbers map to
 151       // the same name.
 152       if (cmp != 0 || lastFieldNumber != -1)
 153         return cmp;
 154     }
 155
 156     UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
 157     UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
 158     final int len;
 159     if (utf16Result1.length < utf16Result2.length)
 160       len = utf16Result1.length;
 161     else
 162       len = utf16Result2.length;
 163
 164     for(int i=0;i<len;i++) {
 165       final char ch1 = utf16Result1.result[i];
 166       final char ch2 = utf16Result2.result[i];
 167       if (ch1 != ch2)
 168         return ch1-ch2;
 169     }
 170     return utf16Result1.length - utf16Result2.length;
 171   }
 172
 173   /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
 174     Term must be lexicographically greater than all previous Terms added.
 175     TermInfo pointers must be positive and greater than all previous.*/
 176   void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
 177     throws IOException {
 178
 179     assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
 180       (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
 181       "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
 182         " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
 183         " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
 184
 185     assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
 186     assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
 187
 188     if (!isIndex && size % indexInterval == 0)
 189       other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi);                      // add an index term
 190
 191     writeTerm(fieldNumber, termBytes, termBytesLength);                        // write term
 192
 193     output.writeVInt(ti.docFreq);                       // write doc freq
 194     output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
 195     output.writeVLong(ti.proxPointer - lastTi.proxPointer);
 196
 197     if (ti.docFreq >= skipInterval) {
 198       output.writeVInt(ti.skipOffset);
 199     }
 200
 201     if (isIndex) {
 202       output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
 203       lastIndexPointer = other.output.getFilePointer(); // write pointer
 204     }
 205
 206     lastFieldNumber = fieldNumber;
 207     lastTi.set(ti);
 208     size++;
 209   }
 210
 211   private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
 212        throws IOException {
 213
 214     // TODO: UTF16toUTF8 could tell us this prefix
 215     // Compute prefix in common with last term:
 216     int start = 0;
 217     final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
 218     while(start < limit) {
 219       if (termBytes[start] != lastTermBytes[start])
 220         break;
 221       start++;
 222     }
 223
 224     final int length = termBytesLength - start;
 225     output.writeVInt(start);                     // write shared prefix length
 226     output.writeVInt(length);                  // write delta length
 227     output.writeBytes(termBytes, start, length);  // write delta bytes
 228     output.writeVInt(fieldNumber); // write field num
 229     if (lastTermBytes.length < termBytesLength) {
 230       lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
 231     }
 232     System.arraycopy(termBytes, start, lastTermBytes, start, length);
 233     lastTermBytesLength = termBytesLength;
 234   }
 235
 236   /** Called to complete TermInfos creation. */
 237   public void close() throws IOException {
 238     try {
 239       output.seek(4);          // write size after format
 240       output.writeLong(size);
 241     } finally {
 242       try {
 243         output.close();
 244       } finally {
 245         if (!isIndex) {
 246           other.close();
 247         }
 248       }
 249     }
 250
 251   }
 252
 253 }