pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / misc / src / java / org / apache / lucene / index / TermVectorAccessor.java
diff --git a/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java

new file mode 100644 (file)

index 0000000..293c37a
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/index/TermVectorAccessor.java
@@ -0,0 +1,170 @@
+package org.apache.lucene.index;
+
+import org.apache.lucene.util.StringHelper;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+/*
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+
+/**
+ * Transparent access to the vector space model,
+ * either via TermFreqVector or by resolving it from the inverted index.
+ * <p/>
+ * Resolving a term vector from a large index can be a time consuming process.
+ * <p/>
+ * Warning! This class is not thread safe!
+ */
+public class TermVectorAccessor {
+
+  public TermVectorAccessor() {
+  }
+
+  /**
+   * Instance reused to save garbage collector some time
+   */
+  private TermVectorMapperDecorator decoratedMapper = new TermVectorMapperDecorator();
+
+
+  /**
+   * Visits the TermVectorMapper and populates it with terms available for a given document,
+   * either via a vector created at index time or by resolving them from the inverted index.
+   *
+   * @param indexReader    Index source
+   * @param documentNumber Source document to access
+   * @param fieldName      Field to resolve
+   * @param mapper         Mapper to be mapped with data
+   * @throws IOException
+   */
+  public void accept(IndexReader indexReader, int documentNumber, String fieldName, TermVectorMapper mapper) throws IOException {
+
+    fieldName = StringHelper.intern(fieldName);
+
+    decoratedMapper.decorated = mapper;
+    decoratedMapper.termVectorStored = false;
+
+    indexReader.getTermFreqVector(documentNumber, fieldName, decoratedMapper);
+
+    if (!decoratedMapper.termVectorStored) {
+      mapper.setDocumentNumber(documentNumber);
+      build(indexReader, fieldName, mapper, documentNumber);
+    }
+  }
+
+  /** Instance reused to save garbage collector some time */
+  private List<String> tokens;
+
+  /** Instance reused to save garbage collector some time */
+  private List<int[]> positions;
+
+  /** Instance reused to save garbage collector some time */
+  private List<Integer> frequencies;
+
+
+  /**
+   * Populates the mapper with terms available for the given field in a document
+   * by resolving the inverted index.
+   *
+   * @param indexReader
+   * @param field interned field name
+   * @param mapper
+   * @param documentNumber
+   * @throws IOException
+   */
+  private void build(IndexReader indexReader, String field, TermVectorMapper mapper, int documentNumber) throws IOException {
+
+    if (tokens == null) {
+      tokens = new ArrayList<String>(500);
+      positions = new ArrayList<int[]>(500);
+      frequencies = new ArrayList<Integer>(500);
+    } else {
+      tokens.clear();
+      frequencies.clear();
+      positions.clear();
+    }
+
+    TermEnum termEnum = indexReader.terms(new Term(field, ""));
+    if (termEnum.term() != null) {
+      while (termEnum.term().field() == field) {
+        TermPositions termPositions = indexReader.termPositions(termEnum.term());
+        if (termPositions.skipTo(documentNumber)) {
+  
+          frequencies.add(Integer.valueOf(termPositions.freq()));
+          tokens.add(termEnum.term().text());
+  
+  
+          if (!mapper.isIgnoringPositions()) {
+            int[] positions = new int[termPositions.freq()];
+            for (int i = 0; i < positions.length; i++) {
+              positions[i] = termPositions.nextPosition();
+            }
+            this.positions.add(positions);
+          } else {
+            positions.add(null);
+          }
+        }
+        termPositions.close();
+        if (!termEnum.next()) {
+          break;
+        }
+      }
+      mapper.setDocumentNumber(documentNumber);
+      mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions());
+      for (int i = 0; i < tokens.size(); i++) {
+        mapper.map(tokens.get(i), frequencies.get(i).intValue(), (TermVectorOffsetInfo[]) null, positions.get(i));
+      }
+    }
+    termEnum.close();
+
+
+  }
+
+
+  private static class TermVectorMapperDecorator extends TermVectorMapper {
+
+    private TermVectorMapper decorated;
+
+    @Override
+    public boolean isIgnoringPositions() {
+      return decorated.isIgnoringPositions();
+    }
+
+    @Override
+    public boolean isIgnoringOffsets() {
+      return decorated.isIgnoringOffsets();
+    }
+
+    private boolean termVectorStored = false;
+
+    @Override
+    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+      decorated.setExpectations(field, numTerms, storeOffsets, storePositions);
+      termVectorStored = true;
+    }
+
+    @Override
+    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+      decorated.map(term, frequency, offsets, positions);
+    }
+
+    @Override
+    public void setDocumentNumber(int documentNumber) {
+      decorated.setDocumentNumber(documentNumber);
+    }
+  }
+
+}