package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Holds state for inverting all occurrences of a single
 * field in the document.  This class doesn't do anything
 * itself; instead, it forwards the tokens produced by
 * analysis to its own consumer
 * (InvertedDocConsumerPerField).  It also interacts with an
 * endConsumer (InvertedDocEndConsumerPerField).
 */

final class DocInverterPerField extends DocFieldConsumerPerField {

  private final DocInverterPerThread perThread;
  private final FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;

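  // The constructor wires this field into both downstream chains:
  // addField returns the per-field consumer (in the default indexing
  // chain, the terms hash that builds postings and term vectors) and
  // the per-field end consumer (typically the norms writer).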
  public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
    this.perThread = perThread;
    this.fieldInfo = fieldInfo;
    docState = perThread.docState;
    fieldState = perThread.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    this.endConsumer = perThread.endConsumer.addField(this, fieldInfo);
  }

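  // Abort both chains; the try/finally guarantees endConsumer.abort()
  // runs even if consumer.abort() throws.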
  @Override
  void abort() {
    try {
      consumer.abort();
    } finally {
      endConsumer.abort();
    }
  }

  @Override
  public void processFields(final Fieldable[] fields,
                            final int count) throws IOException {

    fieldState.reset(docState.doc.getBoost());

    final int maxFieldLength = docState.maxFieldLength;

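    // Ask the consumer whether these field instances need to be
    // inverted at all; even when they don't, we still loop below so
    // the fields array can be cleared for GC (see LUCENE-2387).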
    final boolean doInvert = consumer.start(fields, count);

    for (int i = 0; i < count; i++) {

      final Fieldable field = fields[i];

      // TODO FI: this should be "genericized": ask the consumer
      // whether it wants to see this particular field tokenized.
      if (field.isIndexed() && doInvert) {

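        // Multiple instances of the same field name are logically
        // concatenated; the analyzer's position increment gap keeps
        // phrase and span queries from matching across the boundary.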
        if (i > 0)
          fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);

        if (!field.isTokenized()) {               // un-tokenized field
          String stringValue = field.stringValue();
          final int valueLength = stringValue.length();
          perThread.singleToken.reinit(stringValue, 0, valueLength);
          fieldState.attributeSource = perThread.singleToken;
          consumer.start(field);

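          // As in the tokenized case below, a failure in add() marks
          // the writer as aborting, since the consumer's internal
          // state may now be corrupt.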
          boolean success = false;
          try {
            consumer.add();
            success = true;
          } finally {
            if (!success)
              docState.docWriter.setAborting();
          }
          fieldState.offset += valueLength;
          fieldState.length++;
          fieldState.position++;
        } else {                                  // tokenized field
          final TokenStream stream;
          final TokenStream streamValue = field.tokenStreamValue();

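          // Value precedence: an explicit TokenStream wins, then a
          // Reader, then the String value wrapped in the per-thread
          // reusable string reader; a field with none of these is an
          // error.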
          if (streamValue != null)
            stream = streamValue;
          else {
            // the field does not have a TokenStream,
            // so we have to obtain one from the analyzer
            final Reader reader;                          // find or make Reader
            final Reader readerValue = field.readerValue();

            if (readerValue != null)
              reader = readerValue;
            else {
              String stringValue = field.stringValue();
              if (stringValue == null)
                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
              perThread.stringReader.init(stringValue);
              reader = perThread.stringReader;
            }

            // Tokenize the field value with the analyzer's reusable
            // token stream
            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
          }

          // reset the TokenStream to the first token
          stream.reset();

          final int startLength = fieldState.length;

          try {
            boolean hasMoreTokens = stream.incrementToken();

            fieldState.attributeSource = stream;

            OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

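            // The consumer pulls its own attributes (e.g. the term
            // attribute) from fieldState.attributeSource, so the
            // source must be set before start(field) is called.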
            consumer.start(field);

            for (;;) {

              // If we hit an exception in stream.incrementToken()
              // below (which is fairly common, eg if the analyzer
              // chokes on a given document), then it's non-aborting
              // and (above) this one document will be marked as
              // deleted, but will still consume a docID

              if (!hasMoreTokens) break;

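              // fieldState.position is decremented here and then
              // incremented again after consumer.add(), so during
              // add() it holds the 0-based position of this token.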
              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr;
              if (fieldState.position > 0) {
                fieldState.position--;
              }

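              // a zero position increment means this token overlaps
              // the previous one, e.g. a stacked synonym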
              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }
              fieldState.position++;
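              // Honor IndexWriter's maxFieldLength: once the limit is
              // reached, the remaining tokens for this field are
              // silently dropped (apart from the infoStream note).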
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }

              hasMoreTokens = stream.incrementToken();
            }
            // trigger streams to perform end-of-stream operations
            stream.end();

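            // Accumulate the final offset reported by end() so that a
            // following instance of this same field starts at the
            // correct offset.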
            fieldState.offset += offsetAttribute.endOffset();
          } finally {
            stream.close();
          }
        }

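        // Analogous to the position increment gap above: the
        // analyzer's offset gap keeps the character offsets of
        // successive instances of this field from overlapping.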
        fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
        fieldState.boost *= field.getBoost();
      }

      // LUCENE-2387: don't hang onto the field, so GC can
      // reclaim
      fields[i] = null;
    }

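    // Per-document cleanup in both chains; e.g. the end consumer
    // records this document's norms here.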
    consumer.finish();
    endConsumer.finish();
  }
}