diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
new file mode 100644 (file)
index 0000000..9418796
--- /dev/null
@@ -0,0 +1,208 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * Holds state for inverting all occurrences of a single
+ * field in the document.  This class doesn't do anything
+ * itself; instead, it forwards the tokens produced by
+ * analysis to its own consumer
+ * (InvertedDocConsumerPerField).  It also interacts with an
+ * endConsumer (InvertedDocEndConsumerPerField).
+ */
+
+final class DocInverterPerField extends DocFieldConsumerPerField {
+
+  final private DocInverterPerThread perThread;
+  final private FieldInfo fieldInfo;
+  final InvertedDocConsumerPerField consumer;
+  final InvertedDocEndConsumerPerField endConsumer;
+  final DocumentsWriter.DocState docState;
+  final FieldInvertState fieldState;
+
+  public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
+    this.perThread = perThread;
+    this.fieldInfo = fieldInfo;
+    docState = perThread.docState;
+    fieldState = perThread.fieldState;
+    this.consumer = perThread.consumer.addField(this, fieldInfo);
+    this.endConsumer = perThread.endConsumer.addField(this, fieldInfo);
+  }
+
+  @Override
+  void abort() {
+    try {
+      consumer.abort();
+    } finally {
+      endConsumer.abort();
+    }
+  }
+
+  @Override
+  public void processFields(final Fieldable[] fields,
+                            final int count) throws IOException {
+
+    fieldState.reset(docState.doc.getBoost());
+
+    final int maxFieldLength = docState.maxFieldLength;
+
+    final boolean doInvert = consumer.start(fields, count);
+
+    for(int i=0;i<count;i++) {
+
+      final Fieldable field = fields[i];
+
+      // TODO FI: this should be "genericized" to querying
+      // consumer if it wants to see this particular field
+      // tokenized.
+      if (field.isIndexed() && doInvert) {
+        
+        if (i > 0)
+          fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
+
+        if (!field.isTokenized()) {              // un-tokenized field
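+          // Un-tokenized fields are indexed as a single token covering
+          // the whole string value; the per-thread singleToken source is
+          // reinitialized rather than allocating a new stream per field.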
+          String stringValue = field.stringValue();
+          final int valueLength = stringValue.length();
+          perThread.singleToken.reinit(stringValue, 0, valueLength);
+          fieldState.attributeSource = perThread.singleToken;
+          consumer.start(field);
+
+          boolean success = false;
+          try {
+            consumer.add();
+            success = true;
+          } finally {
+            if (!success)
+              docState.docWriter.setAborting();
+          }
+          fieldState.offset += valueLength;
+          fieldState.length++;
+          fieldState.position++;
+        } else {                                  // tokenized field
+          final TokenStream stream;
+          final TokenStream streamValue = field.tokenStreamValue();
+
+          if (streamValue != null) 
+            stream = streamValue;
+          else {
+            // the field does not have a TokenStream,
+            // so we have to obtain one from the analyzer
+            final Reader reader;                         // find or make Reader
+            final Reader readerValue = field.readerValue();
+
+            if (readerValue != null)
+              reader = readerValue;
+            else {
+              String stringValue = field.stringValue();
+              if (stringValue == null)
+                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
+              perThread.stringReader.init(stringValue);
+              reader = perThread.stringReader;
+            }
+          
+            // Tokenize field and add to postingTable
+            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
+          }
+
+          // reset the TokenStream to the first token
+          stream.reset();
+
+          final int startLength = fieldState.length;
+          
+          try {
+            boolean hasMoreTokens = stream.incrementToken();
+
+            fieldState.attributeSource = stream;
+
+            OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
+            PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
+            
+            consumer.start(field);
+            
+            for(;;) {
+
+              // If we hit an exception in stream.incrementToken below
+              // (which is fairly common, e.g. if the analyzer
+              // chokes on a given document), then it's
+              // non-aborting and (above) this one document
+              // will be marked as deleted, but will still
+              // consume a docID
+              
+              if (!hasMoreTokens) break;
+              
+              final int posIncr = posIncrAttribute.getPositionIncrement();
+              fieldState.position += posIncr;
+              if (fieldState.position > 0) {
+                fieldState.position--;
+              }
+
+              if (posIncr == 0)
+                fieldState.numOverlap++;
+
+              boolean success = false;
+              try {
+                // If we hit an exception in here, we abort
+                // all buffered documents since the last
+                // flush, on the likelihood that the
+                // internal state of the consumer is now
+                // corrupt and should not be flushed to a
+                // new segment:
+                consumer.add();
+                success = true;
+              } finally {
+                if (!success)
+                  docState.docWriter.setAborting();
+              }
+              fieldState.position++;
+              if (++fieldState.length >= maxFieldLength) {
+                if (docState.infoStream != null)
+                  docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
+                break;
+              }
+
+              hasMoreTokens = stream.incrementToken();
+            }
+            // trigger streams to perform end-of-stream operations
+            stream.end();
+            
+            fieldState.offset += offsetAttribute.endOffset();
+          } finally {
+            stream.close();
+          }
+        }
+
+        fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
+        fieldState.boost *= field.getBoost();
+      }
+
+      // LUCENE-2387: don't hang onto the field, so GC can
+      // reclaim
+      fields[i] = null;
+    }
+
+    consumer.finish();
+    endConsumer.finish();
+  }
+}
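
The tokenized branch of processFields follows the standard Lucene 3.x TokenStream contract: fetch the attributes up front, reset() the stream, loop over incrementToken(), call end() to record the final offset, and close() in a finally block. Below is a minimal standalone sketch of that same contract against the public 3.5 analysis API; the analyzer choice (WhitespaceAnalyzer), Version.LUCENE_35, the field name "body" and the sample text are illustrative and not taken from this file.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamContractDemo {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_35);

        // Same lookup processFields performs when the field carries
        // neither a pre-built TokenStream nor a Reader value.
        TokenStream stream =
            analyzer.reusableTokenStream("body", new StringReader("a quick brown fox"));

        // Attributes are fetched once, before iteration, just as
        // fieldState.attributeSource is wired up in processFields.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

        int position = -1;
        try {
          stream.reset();                      // rewind to the first token
          while (stream.incrementToken()) {    // one iteration per token
            position += posIncr.getPositionIncrement();
            System.out.println(term.toString() + " pos=" + position
                + " offsets=" + offset.startOffset() + "-" + offset.endOffset());
          }
          stream.end();                        // makes the final offset available
        } finally {
          stream.close();                      // always close, as in the finally above
        }
      }
    }

Note that end() must run before close() so the final offset is available (processFields reads offsetAttribute.endOffset() right after stream.end() to advance fieldState.offset), and close() sits in a finally block so the stream is released even when the analyzer throws mid-document.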