1 package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
21 import java.io.Closeable;
22 import java.io.IOException;
23 import org.apache.lucene.store.IndexOutput;
24 import org.apache.lucene.store.Directory;
25 import org.apache.lucene.util.IOUtils;
26 import org.apache.lucene.util.UnicodeUtil;
27 import org.apache.lucene.util.ArrayUtil;
/** This stores a monotonically increasing set of {@code <Term, TermInfo>} pairs in a
  Directory.  A TermInfos can be written once, in order.  */
33 final class TermInfosWriter implements Closeable {
34 /** The file format version, a negative number. */
35 public static final int FORMAT = -3;
37 // Changed strings to true utf8 with length-in-bytes not
39 public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
41 // NOTE: always change this if you switch to a new format!
42 public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
44 private FieldInfos fieldInfos;
45 private IndexOutput output;
46 private TermInfo lastTi = new TermInfo();
49 // TODO: the default values for these two parameters should be settable from
50 // IndexWriter. However, once that's done, folks will start setting them to
51 // ridiculous values and complaining that things don't work well, as with
52 // mergeFactor. So, let's wait until a number of folks find that alternate
53 // values work better. Note that both of these values are stored in the
54 // segment, so that it's safe to change these w/o rebuilding all indexes.
56 /** Expert: The fraction of terms in the "dictionary" which should be stored
57 * in RAM. Smaller values use more memory, but make searching slightly
58 * faster, while larger values use less memory and make searching slightly
59 * slower. Searching is typically not dominated by dictionary lookup, so
60 * tweaking this is rarely useful.*/
61 int indexInterval = 128;
63 /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
64 * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
65 * smaller indexes, greater acceleration, but fewer accelerable cases, while
66 * smaller values result in bigger indexes, less acceleration and more
67 * accelerable cases. More detailed experiments would be useful here. */
68 int skipInterval = 16;
70 /** Expert: The maximum number of skip levels. Smaller values result in
71 * slightly smaller indexes, but slower skipping in big posting lists.
73 int maxSkipLevels = 10;
75 private long lastIndexPointer;
76 private boolean isIndex;
77 private byte[] lastTermBytes = new byte[10];
78 private int lastTermBytesLength = 0;
79 private int lastFieldNumber = -1;
81 private TermInfosWriter other;
82 private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
84 TermInfosWriter(Directory directory, String segment, FieldInfos fis,
87 initialize(directory, segment, fis, interval, false);
88 boolean success = false;
90 other = new TermInfosWriter(directory, segment, fis, interval, true);
95 IOUtils.closeWhileHandlingException(output, other);
/**
 * Creates a single writer without a companion; used by the package-private constructor
 * to build the instance it stores in {@code other}.
 *
 * @param directory directory in which the output is created
 * @param segment segment name, used as the prefix of the created file name
 * @param fis field infos handed to {@code initialize}
 * @param interval value stored in {@code indexInterval} and written to the header
 * @param isIndex forwarded to {@code initialize}, where it selects the ".tii" or ".tis" output
 * @throws IOException if creating the output or writing its header fails
 */
private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
                        int interval, boolean isIndex) throws IOException {
  initialize(directory, segment, fis, interval, isIndex);
}
105 private void initialize(Directory directory, String segment, FieldInfos fis,
106 int interval, boolean isi) throws IOException {
107 indexInterval = interval;
110 output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
111 boolean success = false;
113 output.writeInt(FORMAT_CURRENT); // write format
114 output.writeLong(0); // leave space for size
115 output.writeInt(indexInterval); // write indexInterval
116 output.writeInt(skipInterval); // write skipInterval
117 output.writeInt(maxSkipLevels); // write maxSkipLevels
118 assert initUTF16Results();
122 IOUtils.closeWhileHandlingException(output);
127 void add(Term term, TermInfo ti) throws IOException {
128 UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result);
129 add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti);
132 // Currently used only by assert statements
133 UnicodeUtil.UTF16Result utf16Result1;
134 UnicodeUtil.UTF16Result utf16Result2;
136 // Currently used only by assert statements
137 private boolean initUTF16Results() {
138 utf16Result1 = new UnicodeUtil.UTF16Result();
139 utf16Result2 = new UnicodeUtil.UTF16Result();
143 // Currently used only by assert statement
144 private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
146 if (lastFieldNumber != fieldNumber) {
147 final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
148 // If there is a field named "" (empty string) then we
149 // will get 0 on this comparison, yet, it's "OK". But
150 // it's not OK if two different field numbers map to
152 if (cmp != 0 || lastFieldNumber != -1)
156 UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
157 UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
159 if (utf16Result1.length < utf16Result2.length)
160 len = utf16Result1.length;
162 len = utf16Result2.length;
164 for(int i=0;i<len;i++) {
165 final char ch1 = utf16Result1.result[i];
166 final char ch2 = utf16Result2.result[i];
170 return utf16Result1.length - utf16Result2.length;
173 /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
174 Term must be lexicographically greater than all previous Terms added.
175 TermInfo pointers must be positive and greater than all previous.*/
176 void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
179 assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
180 (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
181 "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
182 " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
183 " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
185 assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
186 assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
188 if (!isIndex && size % indexInterval == 0)
189 other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
191 writeTerm(fieldNumber, termBytes, termBytesLength); // write term
193 output.writeVInt(ti.docFreq); // write doc freq
194 output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
195 output.writeVLong(ti.proxPointer - lastTi.proxPointer);
197 if (ti.docFreq >= skipInterval) {
198 output.writeVInt(ti.skipOffset);
202 output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
203 lastIndexPointer = other.output.getFilePointer(); // write pointer
206 lastFieldNumber = fieldNumber;
211 private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
214 // TODO: UTF16toUTF8 could tell us this prefix
215 // Compute prefix in common with last term:
217 final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
218 while(start < limit) {
219 if (termBytes[start] != lastTermBytes[start])
224 final int length = termBytesLength - start;
225 output.writeVInt(start); // write shared prefix length
226 output.writeVInt(length); // write delta length
227 output.writeBytes(termBytes, start, length); // write delta bytes
228 output.writeVInt(fieldNumber); // write field num
229 if (lastTermBytes.length < termBytesLength) {
230 lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
232 System.arraycopy(termBytes, start, lastTermBytes, start, length);
233 lastTermBytesLength = termBytesLength;
236 /** Called to complete TermInfos creation. */
237 public void close() throws IOException {
239 output.seek(4); // write size after format
240 output.writeLong(size);