X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
diff --git a/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
deleted file mode 100644
index a99ccb3..0000000
--- a/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
+++ /dev/null
@@ -1,717 +0,0 @@
-package org.apache.lucene.store.instantiated;
-
-/**
- * Copyright 2006 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.StringReader;
-import java.io.Closeable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.CollectionUtil;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.BitVector;
-
-/**
- * This class is similar to {@link org.apache.lucene.index.IndexWriter}, but it has no locking mechanism.
- *
- * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader} navigates
- * the same in-memory instances that this writer updates, so searchers that are active
- * while you are committing are bound to throw exceptions.
- *
- * Consider using InstantiatedIndex as if it were immutable.
- *
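- * <p>A minimal usage sketch (not from the original sources; the analyzer and
- * field are illustrative assumptions):
- * <pre>
- *   InstantiatedIndex index = new InstantiatedIndex();
- *   InstantiatedIndexWriter writer = new InstantiatedIndexWriter(
- *       index, new SimpleAnalyzer(Version.LUCENE_34), true);
- *   Document doc = new Document();
- *   doc.add(new Field("title", "hello world", Field.Store.YES, Field.Index.ANALYZED));
- *   writer.addDocument(doc); // buffered until commit
- *   writer.close();          // implies commit()
- * </pre>
- *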
- * @see org.apache.lucene.index.IndexWriter
- */
-public class InstantiatedIndexWriter implements Closeable {
-
- private PrintStream infoStream = null;
-
- private int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
-
- private final InstantiatedIndex index;
- private final Analyzer analyzer;
-
- private Similarity similarity = Similarity.getDefault(); // how to normalize;
-
- private transient Set<String> fieldNameBuffer;
- /**
- * linked to ensure chronological order
- */
- private Map<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>> termDocumentInformationFactoryByDocument = new LinkedHashMap<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>>(2000);
-
- private Set<InstantiatedDocument> unflushedDocuments = new HashSet<InstantiatedDocument>();
-
- public InstantiatedIndexWriter(InstantiatedIndex index) throws IOException {
- this(index, null);
- }
-
- public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer) throws IOException {
- this(index, analyzer, false);
- }
-
- public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer, boolean create) throws IOException {
- this.index = index;
- this.analyzer = analyzer;
- fieldNameBuffer = new HashSet<String>();
- if (create) {
- this.index.initialize();
- }
- }
-
- private int mergeFactor = 2500;
-
- /**
- * The sweet spot for this implementation is somewhere around 2500 buffered documents of about 2K of text each.
- *
- * Benchmark output:
- * <pre>
- * ------------> Report sum by Prefix (MAddDocs) and Round (8 about 8 out of 160153)
- * Operation        round   mrg  buf  cmpnd  runCnt  recsPerRun   rec/s  elapsedSec    avgUsedMem    avgTotalMem
- * MAddDocs_20000       0    10   10   true       1       20000    81,4      245,68   200 325 152    268 156 928
- * MAddDocs_20000 -     1  1000   10   true - -   1 - -   20000 - - 494,1 - -  40,47 - 247 119 072 -  347 025 408
- * MAddDocs_20000       2    10  100   true       1       20000   104,8      190,81   233 895 552    363 720 704
- * MAddDocs_20000 -     3  2000  100   true - -   1 - -   20000 - - 527,2 - -  37,94 - 266 136 448 -  378 273 792
- * MAddDocs_20000       4    10   10  false       1       20000   103,2      193,75   222 089 792    378 273 792
- * MAddDocs_20000 -     5  3000   10  false - -   1 - -   20000 - - 545,2 - -  36,69 - 237 917 152 -  378 273 792
- * MAddDocs_20000       6    10  100  false       1       20000   102,7      194,67   237 018 976    378 273 792
- * MAddDocs_20000 -     7  4000  100  false - -   1 - -   20000 - - 535,8 - -  37,33 - 309 680 640 -  501 968 896
- * </pre>
- *
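- * <p>A hypothetical tuning call (the value is illustrative; once this many
- * documents are buffered, addDocument triggers an implicit commit):
- * <pre>
- *   writer.setMergeFactor(2500);
- * </pre>
- *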
- * @see org.apache.lucene.index.LogMergePolicy#setMergeFactor(int)
- */
- public void setMergeFactor(int mergeFactor) {
- this.mergeFactor = mergeFactor;
- }
-
- /**
- * @see org.apache.lucene.index.LogMergePolicy#getMergeFactor()
- */
- public int getMergeFactor() {
- return mergeFactor;
- }
-
-
- /**
- * If non-null, information about merges and a message when
- * maxFieldLength is reached would be printed to this stream --
- * currently not implemented.
- */
- public void setInfoStream(PrintStream infoStream) {
- this.infoStream = infoStream;
- }
-
-
- public void abort() throws IOException {
- // todo: not implemented -- buffered documents and deletions are not discarded.
- }
-
-
- public void addIndexes(IndexReader[] readers) {
- throw new RuntimeException("Not implemented");
- }
-
-
- public PrintStream getInfoStream() {
- return infoStream;
- }
-
-
- /**
- * Flushes all changes to an index and closes all associated files.
- */
- public void close() throws IOException {
- commit();
- }
-
- /**
- * Returns the number of documents currently in this index.
- */
- public int docCount() {
- // todo: not certain. see http://www.nabble.com/IndexWriter.docCount-tf3128882.html#a8669483
- return index.getDocumentsByNumber().length /* - index.getDeletedDocuments().size() */ + unflushedDocuments.size();
- }
-
- /**
- * Commits the buffered documents and deletions to the index.
- * Write locking is not yet implemented (see the todo below).
- */
- public void commit() throws IOException {
-
- // todo write lock, unless held by caller
-
- boolean orderedTermsDirty = false;
- Set<InstantiatedTerm> dirtyTerms = new HashSet<InstantiatedTerm>(1000);
-
- Map<String, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
- for (String fieldName : fieldNameBuffer) {
- fieldSettingsByFieldName.put(fieldName, new FieldSetting(fieldName));
- }
-
- InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
- System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
- int documentNumber = index.getDocumentsByNumber().length;
-
- List<InstantiatedTerm> orderedTerms = new ArrayList<InstantiatedTerm>(index.getOrderedTerms().length + 5000);
- for (InstantiatedTerm instantiatedTerm : index.getOrderedTerms()) {
- orderedTerms.add(instantiatedTerm);
- }
-
- // update norm array with fake values for new documents
- Map<String, byte[]> normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>(index.getTermsByFieldAndText().size());
- Set<String> fieldNames = new HashSet<String>(20);
- fieldNames.addAll(index.getNormsByFieldNameAndDocumentNumber().keySet());
- fieldNames.addAll(fieldNameBuffer);
- for (String field : index.getTermsByFieldAndText().keySet()) {
- byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
- byte[] oldNorms = index.getNormsByFieldNameAndDocumentNumber().get(field);
- if (oldNorms != null) {
- System.arraycopy(oldNorms, 0, norms, 0, oldNorms.length);
- Arrays.fill(norms, oldNorms.length, norms.length, similarity.encodeNormValue(1.0f));
- } else {
- Arrays.fill(norms, 0, norms.length, similarity.encodeNormValue(1.0f));
- }
- normsByFieldNameAndDocumentNumber.put(field, norms);
- fieldNames.remove(field);
- }
- for (String field : fieldNames) {
- //System.out.println(field);
- byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
- Arrays.fill(norms, 0, norms.length, similarity.encodeNormValue(1.0f));
- normsByFieldNameAndDocumentNumber.put(field, norms);
- }
- fieldNames.clear();
- index.setNormsByFieldNameAndDocumentNumber(normsByFieldNameAndDocumentNumber);
-
- for (Map.Entry<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>> eDocumentTermDocInfoByTermTextAndField : termDocumentInformationFactoryByDocument.entrySet()) {
-
- InstantiatedDocument document = eDocumentTermDocInfoByTermTextAndField.getKey();
-
- // assign document number
- document.setDocumentNumber(documentNumber++);
- documentsByNumber[document.getDocumentNumber()] = document;
-
- // set norms, prepare document and create optimized size collections.
-
- int numFieldsWithTermVectorsInDocument = 0;
- int termsInDocument = 0;
- for (Map.Entry<FieldSetting, Map<String, TermDocumentInformationFactory>> eFieldTermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
- if (eFieldTermDocInfoFactoriesByTermText.getKey().storeTermVector) {
- numFieldsWithTermVectorsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
- }
- termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
-
- if (eFieldTermDocInfoFactoriesByTermText.getKey().indexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
- final FieldInvertState invertState = new FieldInvertState();
- invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost());
- invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
- final float norm = similarity.computeNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, invertState);
- normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = similarity.encodeNormValue(norm);
- }
-
- }
-
- /** used for term vectors only, it seems. */
- Map<InstantiatedTerm, InstantiatedTermDocumentInformation> informationByTermOfCurrentDocument = new HashMap<InstantiatedTerm, InstantiatedTermDocumentInformation>(termsInDocument);
-
-
- Map<String, FieldSetting> documentFieldSettingsByFieldName = new HashMap<String, FieldSetting>(eDocumentTermDocInfoByTermTextAndField.getValue().size());
-
- // terms...
- for (Map.Entry<FieldSetting, Map<String, TermDocumentInformationFactory>> eFieldSetting_TermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
- documentFieldSettingsByFieldName.put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eFieldSetting_TermDocInfoFactoriesByTermText.getKey());
-
- // find or create term
- for (Map.Entry<String, TermDocumentInformationFactory> eTermText_TermDocInfoFactory : eFieldSetting_TermDocInfoFactoriesByTermText.getValue().entrySet()) {
-
- // get term..
- InstantiatedTerm term;
- Map<String, InstantiatedTerm> termsByText = index.getTermsByFieldAndText().get(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName);
- if (termsByText == null) {
- termsByText = new HashMap<String, InstantiatedTerm>(1000);
- index.getTermsByFieldAndText().put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, termsByText);
- term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
- termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
- int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
- pos = -1 - pos;
- orderedTerms.add(pos, term);
- orderedTermsDirty = true;
- } else {
- term = termsByText.get(eTermText_TermDocInfoFactory.getKey());
- if (term == null) {
- term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
- termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
- int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
- pos = -1 - pos;
- orderedTerms.add(pos, term);
- orderedTermsDirty = true;
- }
- }
-
- // create association term document information
- //
- // [Term]-- {0..*} | {0..* ordered} --(field)[Document]
- //
- // |
- // [TermDocumentInformation]
-
- int[] positions = new int[eTermText_TermDocInfoFactory.getValue().termPositions.size()];
- for (int i = 0; i < positions.length; i++) {
- positions[i] = eTermText_TermDocInfoFactory.getValue().termPositions.get(i);
- }
-
- byte[][] payloads = new byte[eTermText_TermDocInfoFactory.getValue().payloads.size()][];
- for (int i = 0; i < payloads.length; i++) {
- payloads[i] = eTermText_TermDocInfoFactory.getValue().payloads.get(i);
- }
-
- // couple
-
- InstantiatedTermDocumentInformation info = new InstantiatedTermDocumentInformation(term, document, /*eTermText_TermDocInfoFactory.getValue().termFrequency,*/ positions, payloads);
-
- // todo optimize: this should be cached and grown in batches rather than reallocating and copying the array for every addition!
- InstantiatedTermDocumentInformation[] associatedDocuments;
- if (term.getAssociatedDocuments() != null) {
- associatedDocuments = new InstantiatedTermDocumentInformation[term.getAssociatedDocuments().length + 1];
- System.arraycopy(term.getAssociatedDocuments(), 0, associatedDocuments, 0, term.getAssociatedDocuments().length);
- } else {
- associatedDocuments = new InstantiatedTermDocumentInformation[1];
- }
- associatedDocuments[associatedDocuments.length - 1] = info;
- term.setAssociatedDocuments(associatedDocuments);
-
- // todo optimize, only if term vector?
- informationByTermOfCurrentDocument.put(term, info);
-
-
- dirtyTerms.add(term);
- }
-
- // term vector offsets
- if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().storeOffsetWithTermVector) {
- for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> e : informationByTermOfCurrentDocument.entrySet()) {
- if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName.equals(e.getKey().field())) {
- TermDocumentInformationFactory factory = eFieldSetting_TermDocInfoFactoriesByTermText.getValue().get(e.getKey().text());
- e.getValue().setTermOffsets(factory.termOffsets.toArray(new TermVectorOffsetInfo[factory.termOffsets.size()]));
- }
- }
- }
- }
-
- Map<String, List<InstantiatedTermDocumentInformation>> termDocumentInformationsByField = new HashMap<String, List<InstantiatedTermDocumentInformation>>();
- for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> eTerm_TermDocumentInformation : informationByTermOfCurrentDocument.entrySet()) {
- List<InstantiatedTermDocumentInformation> termDocumentInformations = termDocumentInformationsByField.get(eTerm_TermDocumentInformation.getKey().field());
- if (termDocumentInformations == null) {
- termDocumentInformations = new ArrayList<InstantiatedTermDocumentInformation>();
- termDocumentInformationsByField.put(eTerm_TermDocumentInformation.getKey().field(), termDocumentInformations);
- }
- termDocumentInformations.add(eTerm_TermDocumentInformation.getValue());
- }
-
- for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> eField_TermDocInfos : termDocumentInformationsByField.entrySet()) {
-
- CollectionUtil.quickSort(eField_TermDocInfos.getValue(), tdComp);
-
- // add term vector
- if (documentFieldSettingsByFieldName.get(eField_TermDocInfos.getKey()).storeTermVector) {
- if (document.getVectorSpace() == null) {
- document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>(documentFieldSettingsByFieldName.size()));
- }
- document.getVectorSpace().put(eField_TermDocInfos.getKey(), eField_TermDocInfos.getValue());
- }
-
- }
- fieldSettingsByFieldName.putAll(documentFieldSettingsByFieldName);
- }
-
- // order document informations in dirty terms
- for (InstantiatedTerm term : dirtyTerms) {
- // todo optimize: this is probably unnecessary, as the natural order should already be by document number?
- ArrayUtil.mergeSort(term.getAssociatedDocuments(), InstantiatedTermDocumentInformation.documentNumberComparator);
-
-// // update association class reference for speedy skipTo()
-// for (int i = 0; i < term.getAssociatedDocuments().length; i++) {
-// term.getAssociatedDocuments()[i].setIndexFromTerm(i);
-// }
- }
-
-
- // flush to writer
- index.setDocumentsByNumber(documentsByNumber);
- index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()]));
-
- for (FieldSetting fieldSetting : fieldSettingsByFieldName.values()) {
- index.getFieldSettings().merge(fieldSetting);
- }
- // set term index
- if (orderedTermsDirty) {
- // todo optimize, only update from start position
- for (int i = 0; i < index.getOrderedTerms().length; i++) {
- index.getOrderedTerms()[i].setTermIndex(i);
- }
-
- }
-
- // remove deleted documents
- IndexReader indexDeleter = index.indexReaderFactory();
- if (unflushedDeletions.size() > 0) {
- for (Term term : unflushedDeletions) {
- indexDeleter.deleteDocuments(term);
- }
- unflushedDeletions.clear();
- }
-
-
- // all done, clear buffers
- unflushedDocuments.clear();
- termDocumentInformationFactoryByDocument.clear();
- fieldNameBuffer.clear();
-
-
- // update deleted documents bitset
- if (index.getDeletedDocuments() != null) {
- BitVector deletedDocuments = new BitVector(index.getDocumentsByNumber().length);
- for (int i = 0; i < index.getDeletedDocuments().size(); i++) {
- if (index.getDeletedDocuments().get(i)) {
- deletedDocuments.set(i);
- }
- }
- index.setDeletedDocuments(deletedDocuments);
- }
-
- index.setVersion(System.currentTimeMillis());
-
- // todo unlock
-
- indexDeleter.close();
-
- }
-
- private static final Comparator<InstantiatedTermDocumentInformation> tdComp = new Comparator<InstantiatedTermDocumentInformation>() {
- public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
- return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
- }
- };
-
- /**
- * Adds a document to this index. If the document contains more than
- * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
- * discarded.
- */
- public void addDocument(Document doc) throws IOException {
- addDocument(doc, getAnalyzer());
- }
-
- /**
- * Adds a document to this index, using the provided analyzer instead of the
- * value of {@link #getAnalyzer()}. If the document contains more than
- * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
- * discarded.
- *
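- * <p>A sketch (the analyzer choice is an illustrative assumption):
- * <pre>
- *   writer.addDocument(doc, new WhitespaceAnalyzer(Version.LUCENE_34));
- * </pre>
- *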
- * @param doc
- * @param analyzer
- * @throws IOException
- */
- public void addDocument(Document doc, Analyzer analyzer) throws IOException {
- addDocument(new InstantiatedDocument(doc), analyzer);
- }
-
- /**
- * Tokenizes a document and adds it to the buffer.
- * Try to do all calculations in this method rather than in commit, as this is a non-locking method.
- * Remember, this index implementation expects unlimited memory for maximum speed.
- *
- * @param document
- * @param analyzer
- * @throws IOException
- */
- protected void addDocument(InstantiatedDocument document, Analyzer analyzer) throws IOException {
-
- if (document.getDocumentNumber() != null) {
- throw new RuntimeException("Document number already set! Are you trying to add a document that already is bound to this or another index?");
- }
-
- // todo: write lock
-
- // normalize settings per field name in document
-
- Map<String, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
- for (Fieldable field : document.getDocument().getFields()) {
- FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
- if (fieldSetting == null) {
- fieldSetting = new FieldSetting();
- fieldSetting.fieldName = StringHelper.intern(field.name());
- fieldSettingsByFieldName.put(fieldSetting.fieldName, fieldSetting);
- fieldNameBuffer.add(fieldSetting.fieldName);
- }
-
- // todo: fixme: multiple fields with the same name do not necessarily mean the field boost should be multiplied up.
- fieldSetting.boost *= field.getBoost();
- //fieldSettings.dimensions++;
-
-
- // once fieldSettings, always fieldSettings.
- if (field.getOmitNorms()) {
- fieldSetting.omitNorms = true;
- }
- if (field.isIndexed() ) {
- fieldSetting.indexed = true;
- }
- if (field.isTokenized()) {
- fieldSetting.tokenized = true;
- }
- if (field.isStored()) {
- fieldSetting.stored = true;
- }
- if (field.isBinary()) {
- fieldSetting.isBinary = true;
- }
- if (field.isTermVectorStored()) {
- fieldSetting.storeTermVector = true;
- }
- if (field.isStorePositionWithTermVector()) {
- fieldSetting.storePositionWithTermVector = true;
- }
- if (field.isStoreOffsetWithTermVector()) {
- fieldSetting.storeOffsetWithTermVector = true;
- }
- }
-
- Map<Fieldable, LinkedList<Token>> tokensByField = new LinkedHashMap<Fieldable, LinkedList<Token>>(20);
-
- // tokenize indexed fields.
- for (Iterator<Fieldable> it = document.getDocument().getFields().iterator(); it.hasNext();) {
-
- Fieldable field = it.next();
-
- FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
-
- if (field.isIndexed()) {
-
- LinkedList<Token> tokens = new LinkedList<Token>();
- tokensByField.put(field, tokens);
-
- if (field.isTokenized()) {
- final TokenStream tokenStream;
- // todo readerValue(), binaryValue()
- if (field.tokenStreamValue() != null) {
- tokenStream = field.tokenStreamValue();
- } else {
- tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
- }
-
- // reset the TokenStream to the first token
- tokenStream.reset();
-
- while (tokenStream.incrementToken()) {
- // TODO: this is a simple workaround to still work with Tokens; not very efficient, but as far as I know this writer should be removed soon:
- final Token token = new Token();
- for (Iterator<AttributeImpl> atts = tokenStream.getAttributeImplsIterator(); atts.hasNext();) {
- final AttributeImpl att = atts.next();
- try {
- att.copyTo(token);
- } catch (Exception e) {
- // ignore unsupported attributes;
- // this may fail to copy some attributes if a special combined AttributeImpl is used
- // that implements the basic attributes supported by Token as well as other custom ones in one class.
- }
- }
- tokens.add(token); // the vector will be built on commit.
- fieldSetting.fieldLength++;
- if (fieldSetting.fieldLength > maxFieldLength) {
- break;
- }
- }
- tokenStream.end();
- tokenStream.close();
- } else {
- // untokenized
- String fieldVal = field.stringValue();
- Token token = new Token(0, fieldVal.length(), "untokenized");
- token.setEmpty().append(fieldVal);
- tokens.add(token);
- fieldSetting.fieldLength++;
- }
- }
-
- if (!field.isStored()) {
- it.remove();
- }
- }
-
-
- Map<FieldSetting, Map<String, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting = new HashMap<FieldSetting, Map<String, TermDocumentInformationFactory>>();
- termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);
-
- // build term vector, term positions and term offsets
- for (Map.Entry<Fieldable, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
- FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
-
- Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
- if (termDocumentInformationFactoryByTermText == null) {
- termDocumentInformationFactoryByTermText = new HashMap<String, TermDocumentInformationFactory>();
- termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
- }
-
- int lastOffset = 0;
-
- // for each additional field instance with the same name, advance the position by the analyzer's position increment gap.
- if (fieldSetting.position > 0) {
- // todo: what if no analyzer is set, and there are multiple fields with the same name indexed without tokenization?
- fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName);
- }
-
- for (Token token : eField_Tokens.getValue()) {
-
- TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.toString());
- if (termDocumentInformationFactory == null) {
- termDocumentInformationFactory = new TermDocumentInformationFactory();
- termDocumentInformationFactoryByTermText.put(token.toString(), termDocumentInformationFactory);
- }
- //termDocumentInformationFactory.termFrequency++;
-
- fieldSetting.position += (token.getPositionIncrement() - 1);
- termDocumentInformationFactory.termPositions.add(fieldSetting.position++);
-
- if (token.getPayload() != null && token.getPayload().length() > 0) {
- termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
- fieldSetting.storePayloads = true;
- } else {
- termDocumentInformationFactory.payloads.add(null);
- }
-
- if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
-
- termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset()));
- lastOffset = fieldSetting.offset + token.endOffset();
- }
-
-
- }
-
- if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
- fieldSetting.offset = lastOffset + 1;
- }
-
- }
-
-
- unflushedDocuments.add(document);
-
- // if too many documents in buffer, commit.
- if (unflushedDocuments.size() >= getMergeFactor()) {
- commit(/*lock*/);
- }
-
- // todo: unlock write lock
-
- }
-
-
- private Set<Term> unflushedDeletions = new HashSet<Term>();
-
- public void deleteDocuments(Term term) throws IOException {
- unflushedDeletions.add(term);
- }
-
- public void deleteDocuments(Term[] terms) throws IOException {
- for (Term term : terms) {
- deleteDocuments(term);
- }
- }
-
- public void updateDocument(Term term, Document doc) throws IOException {
- updateDocument(term, doc, getAnalyzer());
- }
-
- public void updateDocument(Term term, Document doc, Analyzer analyzer) throws IOException {
- deleteDocuments(term);
- addDocument(doc, analyzer);
- }
-
- public int getMaxFieldLength() {
- return maxFieldLength;
- }
-
- public void setMaxFieldLength(int maxFieldLength) {
- this.maxFieldLength = maxFieldLength;
- }
-
- public Similarity getSimilarity() {
- return similarity;
- }
-
- public void setSimilarity(Similarity similarity) {
- this.similarity = similarity;
- }
-
- public Analyzer getAnalyzer() {
- return analyzer;
- }
-
- private class TermDocumentInformationFactory {
- private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
- private LinkedList<Integer> termPositions = new LinkedList<Integer>();
- private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
- }
-
-
- static class FieldSetting extends org.apache.lucene.store.instantiated.FieldSetting {
-
- float boost = 1;
- int position = 0;
- int offset;
- int fieldLength = 0;
-
- boolean omitNorms = false;
- boolean isBinary = false;
-
- private FieldSetting() {
- }
-
- private FieldSetting(String fieldName) {
- super(fieldName);
- }
- }
-
-
-}