pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / instantiated / src / java / org / apache / lucene / store / instantiated / InstantiatedIndexReader.java
diff --git a/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java

new file mode 100644 (file)

index 0000000..ce7d468
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
@@ -0,0 +1,443 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.index.*;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitVector;
+
+/**
+ * An InstantiatedIndexReader is not a snapshot in time, it is completely in
+ * sync with the latest commit to the store!
+ * <p>
+ * Consider using InstantiatedIndex as if it was immutable.
+ */
+public class InstantiatedIndexReader extends IndexReader {
+
+  private final InstantiatedIndex index;
+
+  public InstantiatedIndexReader(InstantiatedIndex index) {
+    super();
+    this.index = index;
+    readerFinishedListeners = Collections.synchronizedSet(new HashSet<ReaderFinishedListener>());
+  }
+
+  @Deprecated
+  @Override
+  public boolean isOptimized() {
+    return true;
+  }
+
+  /**
+   * An InstantiatedIndexReader is not a snapshot in time, it is completely in
+   * sync with the latest commit to the store!
+   * 
+   * @return output from {@link InstantiatedIndex#getVersion()} in associated instantiated index.
+   */
+  @Override
+  public long getVersion() {
+    return index.getVersion();
+  }
+
+  @Override
+  public Directory directory() {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * An InstantiatedIndexReader is always current!
+   * 
+   * Check whether this IndexReader is still using the current (i.e., most
+   * recently committed) version of the index. If a writer has committed any
+   * changes to the index since this reader was opened, this will return
+   * <code>false</code>, in which case you must open a new IndexReader in
+   * order to see the changes. See the description of the <a
+   * href="IndexWriter.html#autoCommit"><code>autoCommit</code></a> flag
+   * which controls when the {@link IndexWriter} actually commits changes to the
+   * index.
+   * 
+   * @return always true
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   * @throws UnsupportedOperationException unless overridden in subclass
+   */
+  @Override
+  public boolean isCurrent() throws IOException {
+    return true;
+  }
+
+  public InstantiatedIndex getIndex() {
+    return index;
+  }
+
+  private BitVector uncommittedDeletedDocuments;
+
+  private Map<String,List<NormUpdate>> uncommittedNormsByFieldNameAndDocumentNumber = null;
+
+  private class NormUpdate {
+    private int doc;
+    private byte value;
+
+    public NormUpdate(int doc, byte value) {
+      this.doc = doc;
+      this.value = value;
+    }
+  }
+
+  @Override
+  public int numDocs() {
+    // todo i suppose this value could be cached, but array#length and bitvector#count is fast.
+    int numDocs = getIndex().getDocumentsByNumber().length;
+    if (uncommittedDeletedDocuments != null) {
+      numDocs -= uncommittedDeletedDocuments.count();
+    }
+    if (index.getDeletedDocuments() != null) {
+      numDocs -= index.getDeletedDocuments().count();
+    }
+    return numDocs;
+  }
+
+  @Override
+  public int maxDoc() {
+    return getIndex().getDocumentsByNumber().length;
+  }
+
+  @Override
+  public boolean hasDeletions() {
+    return index.getDeletedDocuments() != null || uncommittedDeletedDocuments != null;
+  }
+
+
+  @Override
+  public boolean isDeleted(int n) {
+    return (index.getDeletedDocuments() != null && index.getDeletedDocuments().get(n))
+        || (uncommittedDeletedDocuments != null && uncommittedDeletedDocuments.get(n));
+  }
+
+
+  @Override
+  protected void doDelete(int docNum) throws IOException {
+
+    // dont delete if already deleted
+    if ((index.getDeletedDocuments() != null && index.getDeletedDocuments().get(docNum))
+        || (uncommittedDeletedDocuments != null && uncommittedDeletedDocuments.get(docNum))) {
+      return;
+    }
+
+    if (uncommittedDeletedDocuments == null) {
+      uncommittedDeletedDocuments = new BitVector(maxDoc());
+    }
+
+    uncommittedDeletedDocuments.set(docNum);
+  }
+
+  @Override
+  protected void doUndeleteAll() throws IOException {
+    // todo: read/write lock
+    uncommittedDeletedDocuments = null;
+    // todo: read/write unlock
+  }
+
+  @Override
+  protected void doCommit(Map<String,String> commitUserData) throws IOException {
+    // todo: read/write lock
+
+    // 1. update norms
+    if (uncommittedNormsByFieldNameAndDocumentNumber != null) {
+      for (Map.Entry<String,List<NormUpdate>> e : uncommittedNormsByFieldNameAndDocumentNumber.entrySet()) {
+        byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey());
+        for (NormUpdate normUpdate : e.getValue()) {
+          norms[normUpdate.doc] = normUpdate.value;
+        }
+      }
+      uncommittedNormsByFieldNameAndDocumentNumber = null;
+    }
+
+    // 2. remove deleted documents
+    if (uncommittedDeletedDocuments != null) {
+      if (index.getDeletedDocuments() == null) {
+        index.setDeletedDocuments(uncommittedDeletedDocuments);
+      } else {
+        for (int d = 0; d< uncommittedDeletedDocuments.size(); d++) {
+          if (uncommittedDeletedDocuments.get(d)) {
+            index.getDeletedDocuments().set(d);
+          }
+        }
+      }
+      uncommittedDeletedDocuments = null;
+    }
+
+    // todo unlock read/writelock
+  }
+
+  @Override
+  protected void doClose() throws IOException {
+    // ignored
+    // todo perhaps release all associated instances?
+  }
+
+  @Override
+  public Collection<String> getFieldNames(FieldOption fieldOption) {
+    Set<String> fieldSet = new HashSet<String>();
+    for (FieldSetting fi : index.getFieldSettings().values()) {
+      if (fieldOption == IndexReader.FieldOption.ALL) {
+        fieldSet.add(fi.fieldName);
+      } else if (!fi.indexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.indexed && fieldOption == IndexReader.FieldOption.INDEXED) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.indexed && fi.storeTermVector == false && fieldOption == IndexReader.FieldOption.INDEXED_NO_TERMVECTOR) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.storeTermVector == true && fi.storePositionWithTermVector == false && fi.storeOffsetWithTermVector == false
+          && fieldOption == IndexReader.FieldOption.TERMVECTOR) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.indexed && fi.storeTermVector && fieldOption == IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector == false
+          && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION) {
+        fieldSet.add(fi.fieldName);
+      } else if (fi.storeOffsetWithTermVector && fi.storePositionWithTermVector == false
+          && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET) {
+        fieldSet.add(fi.fieldName);
+      } else if ((fi.storeOffsetWithTermVector && fi.storePositionWithTermVector)
+          && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET) {
+        fieldSet.add(fi.fieldName);
+      } 
+    }
+    return fieldSet;
+  }
+
+  /**
+   * Return the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup>
+   * position.
+     <p>
+   * <b>Warning!</b>
+   * The resulting document is the actual stored document instance
+   * and not a deserialized clone as retuned by an IndexReader
+   * over a {@link org.apache.lucene.store.Directory}.
+   * I.e., if you need to touch the document, clone it first!
+   * <p>
+   * This can also be seen as a feature for live changes of stored values,
+   * but be careful! Adding a field with an name unknown to the index
+   * or to a field with previously no stored values will make
+   * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
+   * out of sync, causing problems for instance when merging the
+   * instantiated index to another index.
+     <p>
+   * This implementation ignores the field selector! All stored fields are always returned!
+   * <p>
+   *
+   * @param n document number
+   * @param fieldSelector ignored
+   * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   * 
+   * @see org.apache.lucene.document.Fieldable
+   * @see org.apache.lucene.document.FieldSelector
+   * @see org.apache.lucene.document.SetBasedFieldSelector
+   * @see org.apache.lucene.document.LoadFirstFieldSelector
+   */
+  @Override
+  public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
+    return document(n);
+  }
+
+  /**
+   * Returns the stored fields of the <code>n</code><sup>th</sup>
+   * <code>Document</code> in this index.
+   * <p>
+   * <b>Warning!</b>
+   * The resulting document is the actual stored document instance
+   * and not a deserialized clone as retuned by an IndexReader
+   * over a {@link org.apache.lucene.store.Directory}.
+   * I.e., if you need to touch the document, clone it first!
+   * <p>
+   * This can also be seen as a feature for live changes of stored values,
+   * but be careful! Adding a field with an name unknown to the index
+   * or to a field with previously no stored values will make
+   * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
+   * out of sync, causing problems for instance when merging the
+   * instantiated index to another index.
+   *
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+
+  @Override
+  public Document document(int n) throws IOException {
+    return isDeleted(n) ? null : getIndex().getDocumentsByNumber()[n].getDocument();
+  }
+
+  /**
+   * never ever touch these values. it is the true values, unless norms have
+   * been touched.
+   */
+  @Override
+  public byte[] norms(String field) throws IOException {
+    byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
+    if (norms == null) {
+      return new byte[0]; // todo a static final zero length attribute?
+    }
+    if (uncommittedNormsByFieldNameAndDocumentNumber != null) {
+      norms = norms.clone();
+      List<NormUpdate> updated = uncommittedNormsByFieldNameAndDocumentNumber.get(field);
+      if (updated != null) {
+        for (NormUpdate normUpdate : updated) {
+          norms[normUpdate.doc] = normUpdate.value;
+        }
+      }
+    }
+    return norms;
+  }
+
+  @Override
+  public void norms(String field, byte[] bytes, int offset) throws IOException {
+    byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
+    if (norms == null) {
+      return;
+    }
+    System.arraycopy(norms, 0, bytes, offset, norms.length);
+  }
+
+  @Override
+  protected void doSetNorm(int doc, String field, byte value) throws IOException {
+    if (uncommittedNormsByFieldNameAndDocumentNumber == null) {
+      uncommittedNormsByFieldNameAndDocumentNumber = new HashMap<String,List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
+    }
+    List<NormUpdate> list = uncommittedNormsByFieldNameAndDocumentNumber.get(field);
+    if (list == null) {
+      list = new LinkedList<NormUpdate>();
+      uncommittedNormsByFieldNameAndDocumentNumber.put(field, list);
+    }
+    list.add(new NormUpdate(doc, value));
+  }
+
+  @Override
+  public int docFreq(Term t) throws IOException {
+    InstantiatedTerm term = getIndex().findTerm(t);
+    if (term == null) {
+      return 0;
+    } else {
+      return term.getAssociatedDocuments().length;
+    }
+  }
+
+  @Override
+  public TermEnum terms() throws IOException {
+    return new InstantiatedTermEnum(this);
+  }
+
+  @Override
+  public TermEnum terms(Term t) throws IOException {
+    InstantiatedTerm it = getIndex().findTerm(t);
+    if (it != null) {
+      return new InstantiatedTermEnum(this, it.getTermIndex());
+    } else {
+      int startPos = Arrays.binarySearch(index.getOrderedTerms(), t, InstantiatedTerm.termComparator);
+      if (startPos < 0) {
+        startPos = -1 - startPos;
+      }
+      return new InstantiatedTermEnum(this, startPos);
+    }
+  }
+
+  @Override
+  public TermDocs termDocs() throws IOException {
+    return new InstantiatedTermDocs(this);
+  }
+
+
+  @Override
+  public TermDocs termDocs(Term term) throws IOException {
+    if (term == null) {
+      return new InstantiatedAllTermDocs(this);
+    } else {
+      InstantiatedTermDocs termDocs = new InstantiatedTermDocs(this);
+      termDocs.seek(term);
+      return termDocs;
+    }
+  }
+
+  @Override
+  public TermPositions termPositions() throws IOException {
+    return new InstantiatedTermPositions(this);
+  }
+
+  @Override
+  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() == null) {
+      return null;
+    }
+    TermFreqVector[] ret = new TermFreqVector[doc.getVectorSpace().size()];
+    Iterator<String> it = doc.getVectorSpace().keySet().iterator();
+    for (int i = 0; i < ret.length; i++) {
+      ret[i] = new InstantiatedTermPositionVector(getIndex().getDocumentsByNumber()[docNumber], it.next());
+    }
+    return ret;
+  }
+
+  @Override
+  public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() == null || doc.getVectorSpace().get(field) == null) {
+      return null;
+    } else {
+      return new InstantiatedTermPositionVector(doc, field);
+    }
+  }
+
+  @Override
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    if (doc.getVectorSpace() != null && doc.getVectorSpace().get(field) == null) {
+      List<InstantiatedTermDocumentInformation> tv = doc.getVectorSpace().get(field);
+      mapper.setExpectations(field, tv.size(), true, true);
+      for (InstantiatedTermDocumentInformation tdi : tv) {
+        mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
+      }
+    }
+  }
+
+  @Override
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
+    for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
+      mapper.setExpectations(e.getKey(), e.getValue().size(), true, true);
+      for (InstantiatedTermDocumentInformation tdi : e.getValue()) {
+        mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
+      }
+    }
+  }
+}