pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / src / java / org / apache / lucene / index / TermsHashPerField.java
diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java

new file mode 100644 (file)

index 0000000..e05ba80
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -0,0 +1,581 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.SorterTemplate;
+
+final class TermsHashPerField extends InvertedDocConsumerPerField {
+
+  final TermsHashConsumerPerField consumer;
+
+  final TermsHashPerField nextPerField;
+  final TermsHashPerThread perThread;
+  final DocumentsWriter.DocState docState;
+  final FieldInvertState fieldState;
+  CharTermAttribute termAtt;
+  
+  // Copied from our perThread
+  final CharBlockPool charPool;
+  final IntBlockPool intPool;
+  final ByteBlockPool bytePool;
+
+  final int streamCount;
+  final int numPostingInt;
+
+  final FieldInfo fieldInfo;
+
+  boolean postingsCompacted;
+  int numPostings;
+  private int postingsHashSize = 4;
+  private int postingsHashHalfSize = postingsHashSize/2;
+  private int postingsHashMask = postingsHashSize-1;
+  private int[] postingsHash;
+ 
+  ParallelPostingsArray postingsArray;
+  
+  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
+    this.perThread = perThread;
+    intPool = perThread.intPool;
+    charPool = perThread.charPool;
+    bytePool = perThread.bytePool;
+    docState = perThread.docState;
+
+    postingsHash = new int[postingsHashSize];
+    Arrays.fill(postingsHash, -1);
+    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+
+    fieldState = docInverterPerField.fieldState;
+    this.consumer = perThread.consumer.addField(this, fieldInfo);
+    initPostingsArray();
+
+    streamCount = consumer.getStreamCount();
+    numPostingInt = 2*streamCount;
+    this.fieldInfo = fieldInfo;
+    if (nextPerThread != null)
+      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
+    else
+      nextPerField = null;
+  }
+
+  private void initPostingsArray() {
+    postingsArray = consumer.createPostingsArray(2);
+    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
+  }
+
+  // sugar: just forwards to DW
+  private void bytesUsed(long size) {
+    if (perThread.termsHash.trackAllocations) {
+      perThread.termsHash.docWriter.bytesUsed(size);
+    }
+  }
+  
+  void shrinkHash(int targetSize) {
+    assert postingsCompacted || numPostings == 0;
+
+    final int newSize = 4;
+    if (newSize != postingsHash.length) {
+      final long previousSize = postingsHash.length;
+      postingsHash = new int[newSize];
+      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
+      Arrays.fill(postingsHash, -1);
+      postingsHashSize = newSize;
+      postingsHashHalfSize = newSize/2;
+      postingsHashMask = newSize-1;
+    }
+
+    // Fully free the postings array on each flush:
+    if (postingsArray != null) {
+      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
+      postingsArray = null;
+    }
+  }
+
+  public void reset() {
+    if (!postingsCompacted)
+      compactPostings();
+    assert numPostings <= postingsHash.length;
+    if (numPostings > 0) {
+      Arrays.fill(postingsHash, 0, numPostings, -1);
+      numPostings = 0;
+    }
+    postingsCompacted = false;
+    if (nextPerField != null)
+      nextPerField.reset();
+  }
+
+  @Override
+  synchronized public void abort() {
+    reset();
+    if (nextPerField != null)
+      nextPerField.abort();
+  }
+  
+  private final void growParallelPostingsArray() {
+    int oldSize = postingsArray.size;
+    this.postingsArray = this.postingsArray.grow();
+    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
+  }
+
+  public void initReader(ByteSliceReader reader, int termID, int stream) {
+    assert stream < streamCount;
+    int intStart = postingsArray.intStarts[termID];
+    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
+    reader.init(bytePool,
+                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
+                ints[upto+stream]);
+  }
+
+  private void compactPostings() {
+    int upto = 0;
+    for(int i=0;i<postingsHashSize;i++) {
+      if (postingsHash[i] != -1) {
+        if (upto < i) {
+          postingsHash[upto] = postingsHash[i];
+          postingsHash[i] = -1;
+        }
+        upto++;
+      }
+    }
+
+    assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;
+    postingsCompacted = true;
+  }
+
+  /** Collapse the hash table & sort in-place. */
+  public int[] sortPostings() {
+    compactPostings();
+    final int[] postingsHash = this.postingsHash;
+    new SorterTemplate() {
+      @Override
+      protected void swap(int i, int j) {
+        final int o = postingsHash[i];
+        postingsHash[i] = postingsHash[j];
+        postingsHash[j] = o;
+      }
+      
+      @Override
+      protected int compare(int i, int j) {
+        final int term1 = postingsHash[i], term2 = postingsHash[j];
+        if (term1 == term2)
+          return 0;
+        final int textStart1 = postingsArray.textStarts[term1],
+          textStart2 = postingsArray.textStarts[term2];
+        final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+        final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
+        final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+        final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
+        return comparePostings(text1, pos1, text2, pos2);
+      }
+
+      @Override
+      protected void setPivot(int i) {
+        pivotTerm = postingsHash[i];
+        final int textStart = postingsArray.textStarts[pivotTerm];
+        pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+        pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+      }
+  
+      @Override
+      protected int comparePivot(int j) {
+        final int term = postingsHash[j];
+        if (pivotTerm == term)
+          return 0;
+        final int textStart = postingsArray.textStarts[term];
+        final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+        final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+        return comparePostings(pivotBuf, pivotBufPos, text, pos);
+      }
+      
+      private int pivotTerm, pivotBufPos;
+      private char[] pivotBuf;
+
+      /** Compares term text for two Posting instance and
+       *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
+      private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
+        assert text1 != text2 || pos1 != pos2;
+
+        while(true) {
+          final char c1 = text1[pos1++];
+          final char c2 = text2[pos2++];
+          if (c1 != c2) {
+            if (0xffff == c2)
+              return 1;
+            else if (0xffff == c1)
+              return -1;
+            else
+              return c1-c2;
+          } else
+            // This method should never compare equal postings
+            // unless p1==p2
+            assert c1 != 0xffff;
+        }
+      }
+    }.quickSort(0, numPostings-1);
+    return postingsHash;
+  }
+
+  /** Test whether the text for current RawPostingList p equals
+   *  current tokenText. */
+  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
+    final int textStart = postingsArray.textStarts[termID];
+    
+    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    assert text != null;
+    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+    int tokenPos = 0;
+    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+      if (tokenText[tokenPos] != text[pos])
+        return false;
+    return 0xffff == text[pos];
+  }
+  
+  private boolean doCall;
+  private boolean doNextCall;
+
+  @Override
+  void start(Fieldable f) {
+    termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
+    consumer.start(f);
+    if (nextPerField != null) {
+      nextPerField.start(f);
+    }
+  }
+  
+  @Override
+  boolean start(Fieldable[] fields, int count) throws IOException {
+    doCall = consumer.start(fields, count);
+    if (postingsArray == null) {
+      initPostingsArray();
+    }
+
+    if (nextPerField != null)
+      doNextCall = nextPerField.start(fields, count);
+    return doCall || doNextCall;
+  }
+
+  // Secondary entry point (for 2nd & subsequent TermsHash),
+  // because token text has already been "interned" into
+  // textStart, so we hash by textStart
+  public void add(int textStart) throws IOException {
+    int code = textStart;
+
+    int hashPos = code & postingsHashMask;
+
+    assert !postingsCompacted;
+
+    // Locate RawPostingList in hash
+    int termID = postingsHash[hashPos];
+
+    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        hashPos = code & postingsHashMask;
+        termID = postingsHash[hashPos];
+      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
+    }
+
+    if (termID == -1) {
+
+      // First time we are seeing this token since we last
+      // flushed the hash.
+
+      // New posting
+      termID = numPostings++;
+      if (termID >= postingsArray.size) {
+        growParallelPostingsArray();
+      }
+
+      assert termID >= 0;
+
+      postingsArray.textStarts[termID] = textStart;
+          
+      assert postingsHash[hashPos] == -1;
+      postingsHash[hashPos] = termID;
+
+      if (numPostings == postingsHashHalfSize)
+        rehashPostings(2*postingsHashSize);
+
+      // Init stream slices
+      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+        intPool.nextBuffer();
+
+      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+        bytePool.nextBuffer();
+
+      intUptos = intPool.buffer;
+      intUptoStart = intPool.intUpto;
+      intPool.intUpto += streamCount;
+
+      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+
+      for(int i=0;i<streamCount;i++) {
+        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+      }
+      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
+
+      consumer.newTerm(termID);
+
+    } else {
+      int intStart = postingsArray.intStarts[termID];
+      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+      consumer.addTerm(termID);
+    }
+  }
+
+  // Primary entry point (for first TermsHash)
+  @Override
+  void add() throws IOException {
+
+    assert !postingsCompacted;
+
+    // We are first in the chain so we must "intern" the
+    // term text into textStart address
+
+    // Get the text of this term.
+    final char[] tokenText = termAtt.buffer();
+    final int tokenTextLen = termAtt.length();
+
+    // Compute hashcode & replace any invalid UTF16 sequences
+    int downto = tokenTextLen;
+    int code = 0;
+    while (downto > 0) {
+      char ch = tokenText[--downto];
+
+      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
+        if (0 == downto) {
+          // Unpaired
+          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+        } else {
+          final char ch2 = tokenText[downto-1];
+          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
+            // OK: high followed by low.  This is a valid
+            // surrogate pair.
+            code = ((code*31) + ch)*31+ch2;
+            downto--;
+            continue;
+          } else {
+            // Unpaired
+            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+          }            
+        }
+      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
+                                                          ch == 0xffff)) {
+        // Unpaired or 0xffff
+        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+      }
+
+      code = (code*31) + ch;
+    }
+
+    int hashPos = code & postingsHashMask;
+
+    // Locate RawPostingList in hash
+    int termID = postingsHash[hashPos];
+
+    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code>>8)+code)|1;
+      do {
+        code += inc;
+        hashPos = code & postingsHashMask;
+        termID = postingsHash[hashPos];
+      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
+    }
+
+    if (termID == -1) {
+
+      // First time we are seeing this token since we last
+      // flushed the hash.
+      final int textLen1 = 1+tokenTextLen;
+      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
+        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+          // Just skip this term, to remain as robust as
+          // possible during indexing.  A TokenFilter
+          // can be inserted into the analyzer chain if
+          // other behavior is wanted (pruning the term
+          // to a prefix, throwing an exception, etc).
+
+          if (docState.maxTermPrefix == null)
+            docState.maxTermPrefix = new String(tokenText, 0, 30);
+
+          consumer.skippingLongTerm();
+          return;
+        }
+        charPool.nextBuffer();
+      }
+
+      // New posting
+      termID = numPostings++;
+      if (termID >= postingsArray.size) {
+        growParallelPostingsArray();
+      }
+
+      assert termID != -1;
+
+      final char[] text = charPool.buffer;
+      final int textUpto = charPool.charUpto;
+      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
+      charPool.charUpto += textLen1;
+      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
+      text[textUpto+tokenTextLen] = 0xffff;
+          
+      assert postingsHash[hashPos] == -1;
+      postingsHash[hashPos] = termID;
+
+      if (numPostings == postingsHashHalfSize) {
+        rehashPostings(2*postingsHashSize);
+        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
+      }
+
+      // Init stream slices
+      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+        intPool.nextBuffer();
+
+      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+        bytePool.nextBuffer();
+
+      intUptos = intPool.buffer;
+      intUptoStart = intPool.intUpto;
+      intPool.intUpto += streamCount;
+
+      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+
+      for(int i=0;i<streamCount;i++) {
+        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+      }
+      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
+      
+      consumer.newTerm(termID);
+
+    } else {
+      final int intStart = postingsArray.intStarts[termID];
+      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+      consumer.addTerm(termID);
+    }
+
+    if (doNextCall)
+      nextPerField.add(postingsArray.textStarts[termID]);
+  }
+
+  int[] intUptos;
+  int intUptoStart;
+
+  void writeByte(int stream, byte b) {
+    int upto = intUptos[intUptoStart+stream];
+    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    assert bytes != null;
+    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
+    if (bytes[offset] != 0) {
+      // End of slice; allocate a new one
+      offset = bytePool.allocSlice(bytes, offset);
+      bytes = bytePool.buffer;
+      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
+    }
+    bytes[offset] = b;
+    (intUptos[intUptoStart+stream])++;
+  }
+
+  public void writeBytes(int stream, byte[] b, int offset, int len) {
+    // TODO: optimize
+    final int end = offset + len;
+    for(int i=offset;i<end;i++)
+      writeByte(stream, b[i]);
+  }
+
+  void writeVInt(int stream, int i) {
+    assert stream < streamCount;
+    while ((i & ~0x7F) != 0) {
+      writeByte(stream, (byte)((i & 0x7f) | 0x80));
+      i >>>= 7;
+    }
+    writeByte(stream, (byte) i);
+  }
+
+  @Override
+  void finish() throws IOException {
+    try {
+      consumer.finish();
+    } finally {
+      if (nextPerField != null) {
+        nextPerField.finish();
+      }
+    }
+  }
+
+  /** Called when postings hash is too small (> 50%
+   *  occupied) or too large (< 20% occupied). */
+  void rehashPostings(final int newSize) {
+
+    final int newMask = newSize-1;
+
+    int[] newHash = new int[newSize];
+    Arrays.fill(newHash, -1);
+    for(int i=0;i<postingsHashSize;i++) {
+      int termID = postingsHash[i];
+      if (termID != -1) {
+        int code;
+        if (perThread.primary) {
+          final int textStart = postingsArray.textStarts[termID];
+          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+          int pos = start;
+          while(text[pos] != 0xffff)
+            pos++;
+          code = 0;
+          while (pos > start)
+            code = (code*31) + text[--pos];
+        } else
+          code = postingsArray.textStarts[termID];
+
+        int hashPos = code & newMask;
+        assert hashPos >= 0;
+        if (newHash[hashPos] != -1) {
+          final int inc = ((code>>8)+code)|1;
+          do {
+            code += inc;
+            hashPos = code & newMask;
+          } while (newHash[hashPos] != -1);
+        }
+        newHash[hashPos] = termID;
+      }
+    }
+
+    postingsHashMask = newMask;
+    postingsHash = newHash;
+
+    postingsHashSize = newSize;
+    postingsHashHalfSize = newSize >> 1;
+  }
+}