1 package org.apache.lucene.store.instantiated;
/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.Closeable;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.util.BitVector;
/**
 * Represented as a coupled graph of class instances, this
 * all-in-memory index store implementation delivers search
 * results up to 100 times faster than the file-centric RAMDirectory
 * at the cost of greater RAM consumption.
 * <p>
 * @lucene.experimental
 * <p>
 * There are no read and write locks in this store.
 * An {@link InstantiatedIndexReader} reports itself as current
 * ({@link InstantiatedIndexReader#isCurrent()}) all the time,
 * and {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}
 * will attempt to update instances of the object graph in memory
 * at the same time as a searcher is reading from it.
 * <p>
 * Consider using InstantiatedIndex as if it were immutable.
 */
55 public class InstantiatedIndex
56 implements Serializable,Closeable {
58 private static final long serialVersionUID = 1l;
60 private long version = System.currentTimeMillis();
62 private InstantiatedDocument[] documentsByNumber;
64 private BitVector deletedDocuments;
66 private Map<String, Map<String, InstantiatedTerm>> termsByFieldAndText;
67 private InstantiatedTerm[] orderedTerms;
69 private Map<String, byte[]> normsByFieldNameAndDocumentNumber;
71 private FieldSettings fieldSettings;
/**
 * Creates an empty instantiated index for you to fill with data using an {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}.
 */
public InstantiatedIndex() {
  // All structures start out empty; the actual reset lives in initialize() so it can be reused.
  initialize();
}

/**
 * Resets the index structures to their empty state: no terms, no documents,
 * no norms and fresh field settings. The deleted-documents bit vector and the
 * version are not touched.
 */
void initialize() {
  // todo: clear index without losing memory (uncouple stuff)
  termsByFieldAndText = new HashMap<String, Map<String, InstantiatedTerm>>();
  fieldSettings = new FieldSettings();
  orderedTerms = new InstantiatedTerm[0];
  documentsByNumber = new InstantiatedDocument[0];
  normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>();
}
/**
 * Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
 *
 * @param sourceIndexReader the source index this new instantiated index will be copied from.
 * @throws IOException if an error occurs while accessing the source. A source index that is not
 *         optimized is only reported on standard output; it does not cause an exception.
 */
public InstantiatedIndex(IndexReader sourceIndexReader) throws IOException {
  // No field filter: a null set tells the two-argument constructor to copy every field.
  this(sourceIndexReader, (Set<String>) null);
}
/**
 * Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
 *
 * @param sourceIndexReader the source index this new instantiated index will be copied from.
 * @param fields fields to be added, or null for all
 * @throws IOException if an error occurs while accessing the source. A source index that is not
 *         optimized is only reported on standard output; it does not cause an exception.
 */
109 public InstantiatedIndex(IndexReader sourceIndexReader, Set<String> fields) throws IOException {
111 if (!sourceIndexReader.isOptimized()) {
112 System.out.println(("Source index is not optimized."));
113 //throw new IOException("Source index is not optimized.");
119 Collection<String> allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL);
121 // load field options
123 Collection<String> indexedNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED);
124 for (String name : indexedNames) {
125 FieldSetting setting = fieldSettings.get(name, true);
126 setting.indexed = true;
128 Collection<String> indexedNoVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR);
129 for (String name : indexedNoVecNames) {
130 FieldSetting setting = fieldSettings.get(name, true);
131 setting.storeTermVector = false;
132 setting.indexed = true;
134 Collection<String> indexedVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR);
135 for (String name : indexedVecNames) {
136 FieldSetting setting = fieldSettings.get(name, true);
137 setting.storeTermVector = true;
138 setting.indexed = true;
140 Collection<String> payloadNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS);
141 for (String name : payloadNames) {
142 FieldSetting setting = fieldSettings.get(name, true);
143 setting.storePayloads = true;
145 Collection<String> termVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR);
146 for (String name : termVecNames) {
147 FieldSetting setting = fieldSettings.get(name, true);
148 setting.storeTermVector = true;
150 Collection<String> termVecOffsetNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET);
151 for (String name : termVecOffsetNames) {
152 FieldSetting setting = fieldSettings.get(name, true);
153 setting.storeOffsetWithTermVector = true;
155 Collection<String> termVecPosNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION);
156 for (String name : termVecPosNames) {
157 FieldSetting setting = fieldSettings.get(name, true);
158 setting.storePositionWithTermVector = true;
160 Collection<String> termVecPosOffNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET);
161 for (String name : termVecPosOffNames) {
162 FieldSetting setting = fieldSettings.get(name, true);
163 setting.storeOffsetWithTermVector = true;
164 setting.storePositionWithTermVector = true;
166 Collection<String> unindexedNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.UNINDEXED);
167 for (String name : unindexedNames) {
168 FieldSetting setting = fieldSettings.get(name, true);
169 setting.indexed = false;
173 documentsByNumber = new InstantiatedDocument[sourceIndexReader.maxDoc()];
175 if (sourceIndexReader.hasDeletions()) {
176 deletedDocuments = new BitVector(sourceIndexReader.maxDoc());
180 for (int i = 0; i < sourceIndexReader.maxDoc(); i++) {
181 if (sourceIndexReader.hasDeletions() && sourceIndexReader.isDeleted(i)) {
182 deletedDocuments.set(i);
184 InstantiatedDocument document = new InstantiatedDocument();
185 // copy stored fields from source reader
186 Document sourceDocument = sourceIndexReader.document(i);
187 for (Fieldable field : sourceDocument.getFields()) {
188 if (fields == null || fields.contains(field.name())) {
189 document.getDocument().add(field);
192 document.setDocumentNumber(i);
193 documentsByNumber[i] = document;
194 for (Fieldable field : document.getDocument().getFields()) {
195 if (fields == null || fields.contains(field.name())) {
196 if (field.isTermVectorStored()) {
197 if (document.getVectorSpace() == null) {
198 document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>());
200 document.getVectorSpace().put(field.name(), new ArrayList<InstantiatedTermDocumentInformation>());
210 for (String fieldName : allFieldNames) {
211 if (fields == null || fields.contains(fieldName)) {
212 getNormsByFieldNameAndDocumentNumber().put(fieldName, sourceIndexReader.norms(fieldName));
217 for (String fieldName : allFieldNames) {
218 if (fields == null || fields.contains(fieldName)) {
219 getTermsByFieldAndText().put(fieldName, new HashMap<String, InstantiatedTerm>(5000));
222 List<InstantiatedTerm> terms = new ArrayList<InstantiatedTerm>(5000 * getTermsByFieldAndText().size());
223 TermEnum termEnum = sourceIndexReader.terms();
224 while (termEnum.next()) {
225 if (fields == null || fields.contains(termEnum.term().field())) { // todo skipto if not using field
226 InstantiatedTerm instantiatedTerm = new InstantiatedTerm(termEnum.term().field(), termEnum.term().text());
227 getTermsByFieldAndText().get(termEnum.term().field()).put(termEnum.term().text(), instantiatedTerm);
228 instantiatedTerm.setTermIndex(terms.size());
229 terms.add(instantiatedTerm);
230 instantiatedTerm.setAssociatedDocuments(new InstantiatedTermDocumentInformation[termEnum.docFreq()]);
234 orderedTerms = terms.toArray(new InstantiatedTerm[terms.size()]);
236 // create term-document informations
237 for (InstantiatedTerm term : orderedTerms) {
238 TermPositions termPositions = sourceIndexReader.termPositions(term.getTerm());
240 while (termPositions.next()) {
241 InstantiatedDocument document = documentsByNumber[termPositions.doc()];
243 byte[][] payloads = new byte[termPositions.freq()][];
244 int[] positions = new int[termPositions.freq()];
245 for (int i = 0; i < termPositions.freq(); i++) {
246 positions[i] = termPositions.nextPosition();
248 if (termPositions.isPayloadAvailable()) {
249 payloads[i] = new byte[termPositions.getPayloadLength()];
250 termPositions.getPayload(payloads[i], 0);
254 InstantiatedTermDocumentInformation termDocumentInformation = new InstantiatedTermDocumentInformation(term, document, positions, payloads);
255 term.getAssociatedDocuments()[position++] = termDocumentInformation;
257 if (document.getVectorSpace() != null
258 && document.getVectorSpace().containsKey(term.field())) {
259 document.getVectorSpace().get(term.field()).add(termDocumentInformation);
262 // termDocumentInformation.setIndexFromTerm(indexFromTerm++);
266 // load offsets to term-document informations
267 for (InstantiatedDocument document : getDocumentsByNumber()) {
268 if (document == null) {
271 for (Fieldable field : document.getDocument().getFields()) {
272 if (field.isTermVectorStored() && field.isStoreOffsetWithTermVector()) {
273 TermPositionVector termPositionVector = (TermPositionVector) sourceIndexReader.getTermFreqVector(document.getDocumentNumber(), field.name());
274 if (termPositionVector != null) {
275 for (int i = 0; i < termPositionVector.getTerms().length; i++) {
276 String token = termPositionVector.getTerms()[i];
277 InstantiatedTerm term = findTerm(field.name(), token);
278 InstantiatedTermDocumentInformation termDocumentInformation = term.getAssociatedDocument(document.getDocumentNumber());
279 termDocumentInformation.setTermOffsets(termPositionVector.getOffsets(i));
287 public InstantiatedIndexWriter indexWriterFactory(Analyzer analyzer, boolean create) throws IOException {
288 return new InstantiatedIndexWriter(this, analyzer, create);
291 public InstantiatedIndexReader indexReaderFactory() throws IOException {
292 return new InstantiatedIndexReader(this);
/**
 * Closes this index. The body is currently empty, so nothing is released or
 * decoupled; see the todo inside.
 */
295 public void close() throws IOException {
296 // todo: decouple everything
299 InstantiatedTerm findTerm(Term term) {
300 return findTerm(term.field(), term.text());
303 InstantiatedTerm findTerm(String field, String text) {
304 Map<String, InstantiatedTerm> termsByField = termsByFieldAndText.get(field);
305 if (termsByField == null) {
308 return termsByField.get(text);
/**
 * Returns the term lookup map (field name, then term text, then term).
 * The internal map itself is returned, not a copy.
 */
312 public Map<String, Map<String, InstantiatedTerm>> getTermsByFieldAndText() {
313 return termsByFieldAndText;
317 public InstantiatedTerm[] getOrderedTerms() {
/**
 * Returns the documents indexed by document number.
 * The internal array itself is returned, not a copy; callers in this class
 * check entries for null before use.
 */
321 public InstantiatedDocument[] getDocumentsByNumber() {
322 return documentsByNumber;
/**
 * Returns the norms keyed by field name.
 * The internal map itself is returned, not a copy.
 */
325 public Map<String, byte[]> getNormsByFieldNameAndDocumentNumber() {
326 return normsByFieldNameAndDocumentNumber;
/**
 * Replaces the norms map. The given map is stored as-is, without copying.
 */
329 void setNormsByFieldNameAndDocumentNumber(Map<String, byte[]> normsByFieldNameAndDocumentNumber) {
330 this.normsByFieldNameAndDocumentNumber = normsByFieldNameAndDocumentNumber;
/**
 * Returns the bit vector flagging deleted document numbers.
 * May be null: the field is only assigned when the copied source reader
 * reports deletions or when it is set explicitly.
 */
333 public BitVector getDeletedDocuments() {
334 return deletedDocuments;
/**
 * Replaces the bit vector flagging deleted document numbers.
 */
337 void setDeletedDocuments(BitVector deletedDocuments) {
338 this.deletedDocuments = deletedDocuments;
/**
 * Replaces the ordered terms array. The given array is stored as-is, without copying.
 */
341 void setOrderedTerms(InstantiatedTerm[] orderedTerms) {
342 this.orderedTerms = orderedTerms;
/**
 * Replaces the documents-by-number array. The given array is stored as-is, without copying.
 */
345 void setDocumentsByNumber(InstantiatedDocument[] documentsByNumber) {
346 this.documentsByNumber = documentsByNumber;
350 public long getVersion() {
/**
 * Replaces the version stamp of this index.
 */
354 void setVersion(long version) {
355 this.version = version;
/**
 * Returns the per-field settings of this index.
 * The internal instance itself is returned, not a copy.
 */
359 FieldSettings getFieldSettings() {
360 return fieldSettings;