X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java

diff --git a/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
deleted file mode 100644
index a99ccb3..0000000
--- a/lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
+++ /dev/null
@@ -1,717 +0,0 @@
-package org.apache.lucene.store.instantiated;
-
-/**
- * Copyright 2006 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.StringReader;
-import java.io.Closeable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.util.StringHelper;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.CollectionUtil;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.BitVector;
-
-/**
- * This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism.
- * <p/>
- * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader} navigates the same
- * in-memory instances this writer updates, so searchers that are active while you are
- * committing are bound to throw exceptions.
- * <p/>
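- * A minimal usage sketch (assuming an empty in-memory index; WhitespaceAnalyzer merely
- * stands in for whatever analyzer the application actually uses):
- * <pre>
- *   InstantiatedIndex index = new InstantiatedIndex();
- *   InstantiatedIndexWriter writer =
- *       new InstantiatedIndexWriter(index, new WhitespaceAnalyzer(Version.LUCENE_34), true);
- *   Document doc = new Document();
- *   doc.add(new Field("title", "hello world", Field.Store.YES, Field.Index.ANALYZED));
- *   writer.addDocument(doc);
- *   writer.close(); // flushes the buffer via commit()
- *   IndexReader reader = index.indexReaderFactory();
- * </pre>
- * <p/>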
- * Consider using InstantiatedIndex as if it were immutable.
- *
- * @see org.apache.lucene.index.IndexWriter
- */
-public class InstantiatedIndexWriter implements Closeable {
-
-  private PrintStream infoStream = null;
-
-  private int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
-
-  private final InstantiatedIndex index;
-  private final Analyzer analyzer;
-
-  private Similarity similarity = Similarity.getDefault(); // how to normalize
-
-  private transient Set<String> fieldNameBuffer;
-  /**
-   * linked to ensure chronological order
-   */
-  private Map<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>> termDocumentInformationFactoryByDocument
-      = new LinkedHashMap<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>>(2000);
-
-  private Set<InstantiatedDocument> unflushedDocuments = new HashSet<InstantiatedDocument>();
-
-  public InstantiatedIndexWriter(InstantiatedIndex index) throws IOException {
-    this(index, null);
-  }
-
-  public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer) throws IOException {
-    this(index, analyzer, false);
-  }
-
-  public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer, boolean create) throws IOException {
-    this.index = index;
-    this.analyzer = analyzer;
-    fieldNameBuffer = new HashSet<String>();
-    if (create) {
-      this.index.initialize();
-    }
-  }
-
-  private int mergeFactor = 2500;
-
-  /**
-   * The sweetspot for this implementation is somewhere around 2500 documents of 2K text each.
-   * <p/>
-   * Benchmark output:
-   * <pre>
-   *  ------------> Report sum by Prefix (MAddDocs) and Round (8 about 8 out of 160153)
-   *  Operation      round  mrg buf cmpnd   runCnt   recsPerRun        rec/s  elapsedSec    avgUsedMem    avgTotalMem
-   *  MAddDocs_20000     0   10  10  true        1        20000         81,4      245,68   200 325 152    268 156 928
-   *  MAddDocs_20000 -   1 1000  10  true -  -   1 -  -   20000 -  -   494,1 -  -  40,47 - 247 119 072 -  347 025 408
-   *  MAddDocs_20000     2   10 100  true        1        20000        104,8      190,81   233 895 552    363 720 704
-   *  MAddDocs_20000 -   3 2000 100  true -  -   1 -  -   20000 -  -   527,2 -  -  37,94 - 266 136 448 -  378 273 792
-   *  MAddDocs_20000     4   10  10 false        1        20000        103,2      193,75   222 089 792    378 273 792
-   *  MAddDocs_20000 -   5 3000  10 false -  -   1 -  -   20000 -  -   545,2 -  -  36,69 - 237 917 152 -  378 273 792
-   *  MAddDocs_20000     6   10 100 false        1        20000        102,7      194,67   237 018 976    378 273 792
-   *  MAddDocs_20000 -   7 4000 100 false -  -   1 -  -   20000 -  -   535,8 -  -  37,33 - 309 680 640 -  501 968 896
-   * 
-   * </pre>
-   *
-   * @see org.apache.lucene.index.LogMergePolicy#setMergeFactor(int)
-   */
-  public void setMergeFactor(int mergeFactor) {
-    this.mergeFactor = mergeFactor;
-  }
-
-  /**
-   * @see org.apache.lucene.index.LogMergePolicy#getMergeFactor()
-   */
-  public int getMergeFactor() {
-    return mergeFactor;
-  }
-
-  /**
-   * If non-null, information about merges and a message when maxFieldLength
-   * is reached would be printed to this stream -- currently not yet implemented.
-   */
-  public void setInfoStream(PrintStream infoStream) {
-    this.infoStream = infoStream;
-  }
-
-  public void abort() throws IOException {
-    // what not
-  }
-
-  public void addIndexes(IndexReader[] readers) {
-    throw new RuntimeException("Not implemented");
-  }
-
-  public PrintStream getInfoStream() {
-    return infoStream;
-  }
-
-  /**
-   * Flushes all changes to an index and closes all associated files.
-   */
-  public void close() throws IOException {
-    commit();
-  }
-
-  /**
-   * Returns the number of documents currently in this index.
-   */
-  public int docCount() {
-    // todo: not certain. see http://www.nabble.com/IndexWriter.docCount-tf3128882.html#a8669483
-    return index.getDocumentsByNumber().length /* - index.getDeletedDocuments().size() */ + unflushedDocuments.size();
-  }
-
-  /**
-   * Locks the index and commits the buffered documents.
-   */
-  public void commit() throws IOException {
-
-    // todo write lock, unless held by caller
-
-    boolean orderedTermsDirty = false;
-    Set<InstantiatedTerm> dirtyTerms = new HashSet<InstantiatedTerm>(1000);
-
-    Map<String, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
-    for (String fieldName : fieldNameBuffer) {
-      fieldSettingsByFieldName.put(fieldName, new FieldSetting(fieldName));
-    }
-
-    InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
-    System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
-    int documentNumber = index.getDocumentsByNumber().length;
-
-    List<InstantiatedTerm> orderedTerms = new ArrayList<InstantiatedTerm>(index.getOrderedTerms().length + 5000);
-    for (InstantiatedTerm instantiatedTerm : index.getOrderedTerms()) {
-      orderedTerms.add(instantiatedTerm);
-    }
-
-    // update norm array with fake values for new documents
-    Map<String, byte[]> normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>(index.getTermsByFieldAndText().size());
-    Set<String> fieldNames = new HashSet<String>(20);
-    fieldNames.addAll(index.getNormsByFieldNameAndDocumentNumber().keySet());
-    fieldNames.addAll(fieldNameBuffer);
-    for (String field : index.getTermsByFieldAndText().keySet()) {
-      byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
-      byte[] oldNorms = index.getNormsByFieldNameAndDocumentNumber().get(field);
-      if (oldNorms != null) {
-        System.arraycopy(oldNorms, 0, norms, 0, oldNorms.length);
-        Arrays.fill(norms, oldNorms.length, norms.length, similarity.encodeNormValue(1.0f));
-      } else {
-        Arrays.fill(norms, 0, norms.length, similarity.encodeNormValue(1.0f));
-      }
-      normsByFieldNameAndDocumentNumber.put(field, norms);
-      fieldNames.remove(field);
-    }
-    for (String field : fieldNames) {
-      //System.out.println(field);
-      byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
-      Arrays.fill(norms, 0, norms.length, similarity.encodeNormValue(1.0f));
-      normsByFieldNameAndDocumentNumber.put(field, norms);
-    }
-    fieldNames.clear();
-    index.setNormsByFieldNameAndDocumentNumber(normsByFieldNameAndDocumentNumber);
-
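-    // note: the arrays above were pre-filled with encodeNormValue(1.0f) as a fake
-    // default; the loop below overwrites that value for every indexed field with
-    // norms enabled, using Similarity.computeNorm per field and document.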
-    for (Map.Entry<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>>
-        eDocumentTermDocInfoByTermTextAndField : termDocumentInformationFactoryByDocument.entrySet()) {
-
-      InstantiatedDocument document = eDocumentTermDocInfoByTermTextAndField.getKey();
-
-      // assign document number
-      document.setDocumentNumber(documentNumber++);
-      documentsByNumber[document.getDocumentNumber()] = document;
-
-      // set norms, prepare document and create optimized size collections.
-
-      int numFieldsWithTermVectorsInDocument = 0;
-      int termsInDocument = 0;
-      for (Map.Entry<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> eFieldTermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
-        if (eFieldTermDocInfoFactoriesByTermText.getKey().storeTermVector) {
-          numFieldsWithTermVectorsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
-        }
-        termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
-
-        if (eFieldTermDocInfoFactoriesByTermText.getKey().indexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
-          final FieldInvertState invertState = new FieldInvertState();
-          invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost());
-          invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
-          final float norm = similarity.computeNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, invertState);
-          normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = similarity.encodeNormValue(norm);
-        } else {
-          System.currentTimeMillis();
-        }
-
-      }
-
-      /** used for term vectors only, i think.. */
-      Map<InstantiatedTerm, InstantiatedTermDocumentInformation> informationByTermOfCurrentDocument =
-          new HashMap<InstantiatedTerm, InstantiatedTermDocumentInformation>(termsInDocument);
-
-      Map<String, FieldSetting> documentFieldSettingsByFieldName = new HashMap<String, FieldSetting>(eDocumentTermDocInfoByTermTextAndField.getValue().size());
-
-      // terms...
-      for (Map.Entry<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> eFieldSetting_TermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
-        documentFieldSettingsByFieldName.put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eFieldSetting_TermDocInfoFactoriesByTermText.getKey());
-
-        // find or create term
-        for (Map.Entry<String /*text*/, TermDocumentInformationFactory> eTermText_TermDocInfoFactory : eFieldSetting_TermDocInfoFactoriesByTermText.getValue().entrySet()) {
-
-          // get term..
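-          // (when the term is absent, Collections.binarySearch returns
-          // -(insertionPoint) - 1, so pos = -1 - pos below recovers the insertion
-          // point: e.g. a miss that belongs at index 3 returns -4, and -1 - (-4) == 3)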
-          InstantiatedTerm term;
-          Map<String, InstantiatedTerm> termsByText = index.getTermsByFieldAndText().get(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName);
-          if (termsByText == null) {
-            termsByText = new HashMap<String, InstantiatedTerm>(1000);
-            index.getTermsByFieldAndText().put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, termsByText);
-            term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
-            termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
-            int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
-            pos = -1 - pos;
-            orderedTerms.add(pos, term);
-            orderedTermsDirty = true;
-          } else {
-            term = termsByText.get(eTermText_TermDocInfoFactory.getKey());
-            if (term == null) {
-              term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
-              termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
-              int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
-              pos = -1 - pos;
-              orderedTerms.add(pos, term);
-              orderedTermsDirty = true;
-            }
-          }
-
-          // create association term document information
-          //
-          // [Term]-- {0..*} | {0..* ordered} --(field)[Document]
-          //
-          //                 |
-          //      [TermDocumentInformation]
-
-          int[] positions = new int[eTermText_TermDocInfoFactory.getValue().termPositions.size()];
-          for (int i = 0; i < positions.length; i++) {
-            positions[i] = eTermText_TermDocInfoFactory.getValue().termPositions.get(i);
-          }
-
-          byte[][] payloads = new byte[eTermText_TermDocInfoFactory.getValue().payloads.size()][];
-          for (int i = 0; i < payloads.length; i++) {
-            payloads[i] = eTermText_TermDocInfoFactory.getValue().payloads.get(i);
-          }
-
-          // couple
-
-          InstantiatedTermDocumentInformation info = new InstantiatedTermDocumentInformation(term, document, /*eTermText_TermDocInfoFactory.getValue().termFrequency,*/ positions, payloads);
-
-          // todo optimize, this should be cached and updated to array in batches rather than appending the array once for every position!
-          InstantiatedTermDocumentInformation[] associatedDocuments;
-          if (term.getAssociatedDocuments() != null) {
-            associatedDocuments = new InstantiatedTermDocumentInformation[term.getAssociatedDocuments().length + 1];
-            System.arraycopy(term.getAssociatedDocuments(), 0, associatedDocuments, 0, term.getAssociatedDocuments().length);
-          } else {
-            associatedDocuments = new InstantiatedTermDocumentInformation[1];
-          }
-          associatedDocuments[associatedDocuments.length - 1] = info;
-          term.setAssociatedDocuments(associatedDocuments);
-
-          // todo optimize, only if term vector?
-          informationByTermOfCurrentDocument.put(term, info);
-
-          dirtyTerms.add(term);
-        }
-
-        // term vector offsets
-        if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().storeOffsetWithTermVector) {
-          for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> e : informationByTermOfCurrentDocument.entrySet()) {
-            if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName.equals(e.getKey().field())) {
-              TermDocumentInformationFactory factory = eFieldSetting_TermDocInfoFactoriesByTermText.getValue().get(e.getKey().text());
-              e.getValue().setTermOffsets(factory.termOffsets.toArray(new TermVectorOffsetInfo[factory.termOffsets.size()]));
-            }
-          }
-        }
-      }
-
-      Map<String, List<InstantiatedTermDocumentInformation>> termDocumentInformationsByField = new HashMap<String, List<InstantiatedTermDocumentInformation>>();
-      for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> eTerm_TermDocumentInformation : informationByTermOfCurrentDocument.entrySet()) {
-        List<InstantiatedTermDocumentInformation> termDocumentInformations = termDocumentInformationsByField.get(eTerm_TermDocumentInformation.getKey().field());
-        if (termDocumentInformations == null) {
-          termDocumentInformations = new ArrayList<InstantiatedTermDocumentInformation>();
-          termDocumentInformationsByField.put(eTerm_TermDocumentInformation.getKey().field(), termDocumentInformations);
-        }
-        termDocumentInformations.add(eTerm_TermDocumentInformation.getValue());
-      }
-
-      for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> eField_TermDocInfos : termDocumentInformationsByField.entrySet()) {
-
-        CollectionUtil.quickSort(eField_TermDocInfos.getValue(), tdComp);
-
-        // add term vector
-        if (documentFieldSettingsByFieldName.get(eField_TermDocInfos.getKey()).storeTermVector) {
-          if (document.getVectorSpace() == null) {
-            document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>(documentFieldSettingsByFieldName.size()));
-          }
-          document.getVectorSpace().put(eField_TermDocInfos.getKey(), eField_TermDocInfos.getValue());
-        }
-
-      }
-      fieldSettingsByFieldName.putAll(documentFieldSettingsByFieldName);
-    }
-
-    // order document informations in dirty terms
-    for (InstantiatedTerm term : dirtyTerms) {
-      // todo optimize, i believe this is useless, that the natural order is document number?
-      ArrayUtil.mergeSort(term.getAssociatedDocuments(), InstantiatedTermDocumentInformation.documentNumberComparator);
-
-//      // update association class reference for speedy skipTo()
-//      for (int i = 0; i < term.getAssociatedDocuments().length; i++) {
-//        term.getAssociatedDocuments()[i].setIndexFromTerm(i);
-//      }
-    }
-
-    // flush to writer
-    index.setDocumentsByNumber(documentsByNumber);
-    index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()]));
-
-    for (FieldSetting fieldSetting : fieldSettingsByFieldName.values()) {
-      index.getFieldSettings().merge(fieldSetting);
-    }
-
-    // set term index
-    if (orderedTermsDirty) {
-      // todo optimize, only update from start position
-      for (int i = 0; i < index.getOrderedTerms().length; i++) {
-        index.getOrderedTerms()[i].setTermIndex(i);
-      }
-    }
-
-    // remove deleted documents
-    IndexReader indexDeleter = index.indexReaderFactory();
-    if (unflushedDeletions.size() > 0) {
-      for (Term term : unflushedDeletions) {
-        indexDeleter.deleteDocuments(term);
-      }
-      unflushedDeletions.clear();
-    }
-
-    // all done, clear buffers
-    unflushedDocuments.clear();
-    termDocumentInformationFactoryByDocument.clear();
-    fieldNameBuffer.clear();
-
-    // update deleted documents bitset
-    if (index.getDeletedDocuments() != null) {
-      BitVector deletedDocuments = new BitVector(index.getDocumentsByNumber().length);
-      for (int i = 0; i < index.getDeletedDocuments().size(); i++) {
-        if (index.getDeletedDocuments().get(i)) {
-          deletedDocuments.set(i);
-        }
-      }
-      index.setDeletedDocuments(deletedDocuments);
-    }
-
-    index.setVersion(System.currentTimeMillis());
-
-    // todo unlock
-
-    indexDeleter.close();
-  }
-
-  private static final Comparator<InstantiatedTermDocumentInformation> tdComp = new Comparator<InstantiatedTermDocumentInformation>() {
-    public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
-      return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
-    }
-  };
-
-  /**
-   * Adds a document to this index. If the document contains more than
-   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
-   * discarded.
-   */
-  public void addDocument(Document doc) throws IOException {
-    addDocument(doc, getAnalyzer());
-  }
-
-  /**
-   * Adds a document to this index, using the provided analyzer instead of the
-   * value of {@link #getAnalyzer()}. If the document contains more than
-   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
-   * discarded.
-   *
-   * @param doc
-   * @param analyzer
-   * @throws IOException
-   */
-  public void addDocument(Document doc, Analyzer analyzer) throws IOException {
-    addDocument(new InstantiatedDocument(doc), analyzer);
-  }
-
-  /**
-   * Tokenizes a document and adds it to the buffer.
-   * Try to do all calculations in this method rather than in commit, as this is a non-locking method.
-   * Remember, this index implementation expects unlimited memory for maximum speed.
-   *
-   * @param document
-   * @param analyzer
-   * @throws IOException
-   */
-  protected void addDocument(InstantiatedDocument document, Analyzer analyzer) throws IOException {
-
-    if (document.getDocumentNumber() != null) {
-      throw new RuntimeException("Document number already set! Are you trying to add a document that is already bound to this or another index?");
-    }
-
-    // todo: write lock
-
-    // normalize settings per field name in document
-
-    Map<String /* field name */, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
-    for (Fieldable field : document.getDocument().getFields()) {
-      FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
-      if (fieldSetting == null) {
-        fieldSetting = new FieldSetting();
-        fieldSetting.fieldName = StringHelper.intern(field.name());
-        fieldSettingsByFieldName.put(fieldSetting.fieldName, fieldSetting);
-        fieldNameBuffer.add(fieldSetting.fieldName);
-      }
-
-      // todo: fixme: multiple fields with the same name does not mean field boost += more boost.
-      fieldSetting.boost *= field.getBoost();
-      //fieldSettings.dimensions++;
-
-      // once fieldSettings, always fieldSettings.
-      if (field.getOmitNorms()) {
-        fieldSetting.omitNorms = true;
-      }
-      if (field.isIndexed()) {
-        fieldSetting.indexed = true;
-      }
-      if (field.isTokenized()) {
-        fieldSetting.tokenized = true;
-      }
-      if (field.isStored()) {
-        fieldSetting.stored = true;
-      }
-      if (field.isBinary()) {
-        fieldSetting.isBinary = true;
-      }
-      if (field.isTermVectorStored()) {
-        fieldSetting.storeTermVector = true;
-      }
-      if (field.isStorePositionWithTermVector()) {
-        fieldSetting.storePositionWithTermVector = true;
-      }
-      if (field.isStoreOffsetWithTermVector()) {
-        fieldSetting.storeOffsetWithTermVector = true;
-      }
-    }
-
-    Map<Fieldable, LinkedList<Token>> tokensByField = new LinkedHashMap<Fieldable, LinkedList<Token>>(20);
-
-    // tokenize indexed fields.
-    for (Iterator<Fieldable> it = document.getDocument().getFields().iterator(); it.hasNext();) {
-
-      Fieldable field = it.next();
-
-      FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
-
-      if (field.isIndexed()) {
-
-        LinkedList<Token> tokens = new LinkedList<Token>();
-        tokensByField.put(field, tokens);
-
-        if (field.isTokenized()) {
-          final TokenStream tokenStream;
-          // todo readerValue(), binaryValue()
-          if (field.tokenStreamValue() != null) {
-            tokenStream = field.tokenStreamValue();
-          } else {
-            tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
-          }
-
-          // reset the TokenStream to the first token
-          tokenStream.reset();
-
-          while (tokenStream.incrementToken()) {
-            // TODO: this is a simple workaround to still work with tokens, not very effective, but as far as I know, this writer should get removed soon:
-            final Token token = new Token();
-            for (Iterator<AttributeImpl> atts = tokenStream.getAttributeImplsIterator(); atts.hasNext();) {
-              final AttributeImpl att = atts.next();
-              try {
-                att.copyTo(token);
-              } catch (Exception e) {
-                // ignore unsupported attributes;
-                // this may fail to copy some attributes if a special combined AttributeImpl is used that
-                // implements basic attributes supported by Token and also other customized ones in one class.
-              }
-            }
-            tokens.add(token); // the vector will be built on commit.
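-            // once fieldLength exceeds maxFieldLength, the rest of the stream is
-            // discarded (matching the addDocument javadoc); the counter below
-            // enforces that cap.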
-            fieldSetting.fieldLength++;
-            if (fieldSetting.fieldLength > maxFieldLength) {
-              break;
-            }
-          }
-          tokenStream.end();
-          tokenStream.close();
-        } else {
-          // untokenized
-          String fieldVal = field.stringValue();
-          Token token = new Token(0, fieldVal.length(), "untokenized");
-          token.setEmpty().append(fieldVal);
-          tokens.add(token);
-          fieldSetting.fieldLength++;
-        }
-      }
-
-      if (!field.isStored()) {
-        it.remove();
-      }
-    }
-
-    Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting =
-        new HashMap<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>();
-    termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);
-
-    // build term vector, term positions and term offsets
-    for (Map.Entry<Fieldable, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
-      FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
-
-      Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
-      if (termDocumentInformationFactoryByTermText == null) {
-        termDocumentInformationFactoryByTermText = new HashMap<String, TermDocumentInformationFactory>();
-        termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
-      }
-
-      int lastOffset = 0;
-
-      // for each new field, move positions a bunch.
-      if (fieldSetting.position > 0) {
-        // todo what if no analyzer set, multiple fields with same name and index without tokenization?
-        fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName);
-      }
-
-      for (Token token : eField_Tokens.getValue()) {
-
-        TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.toString());
-        if (termDocumentInformationFactory == null) {
-          termDocumentInformationFactory = new TermDocumentInformationFactory();
-          termDocumentInformationFactoryByTermText.put(token.toString(), termDocumentInformationFactory);
-        }
-        //termDocumentInformationFactory.termFrequency++;
-
-        fieldSetting.position += (token.getPositionIncrement() - 1);
-        termDocumentInformationFactory.termPositions.add(fieldSetting.position++);
-
-        if (token.getPayload() != null && token.getPayload().length() > 0) {
-          termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
-          fieldSetting.storePayloads = true;
-        } else {
-          termDocumentInformationFactory.payloads.add(null);
-        }
-
-        if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
-          termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset()));
-          lastOffset = fieldSetting.offset + token.endOffset();
-        }
-
-      }
-
-      if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
-        fieldSetting.offset = lastOffset + 1;
-      }
-
-    }
-
-    unflushedDocuments.add(document);
-
-    // if too many documents in buffer, commit.
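-    // despite its name, mergeFactor acts here as a buffered-document threshold:
-    // reaching it triggers an automatic commit() rather than a segment merge as
-    // it would in LogMergePolicy.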
-    if (unflushedDocuments.size() >= getMergeFactor()) {
-      commit(/*lock*/);
-    }
-
-    // todo: unlock write lock
-  }
-
-  private Set<Term> unflushedDeletions = new HashSet<Term>();
-
-  public void deleteDocuments(Term term) throws IOException {
-    unflushedDeletions.add(term);
-  }
-
-  public void deleteDocuments(Term[] terms) throws IOException {
-    for (Term term : terms) {
-      deleteDocuments(term);
-    }
-  }
-
-  public void updateDocument(Term term, Document doc) throws IOException {
-    updateDocument(term, doc, getAnalyzer());
-  }
-
-  public void updateDocument(Term term, Document doc, Analyzer analyzer) throws IOException {
-    deleteDocuments(term);
-    addDocument(doc, analyzer);
-  }
-
-  public int getMaxFieldLength() {
-    return maxFieldLength;
-  }
-
-  public void setMaxFieldLength(int maxFieldLength) {
-    this.maxFieldLength = maxFieldLength;
-  }
-
-  public Similarity getSimilarity() {
-    return similarity;
-  }
-
-  public void setSimilarity(Similarity similarity) {
-    this.similarity = similarity;
-  }
-
-  public Analyzer getAnalyzer() {
-    return analyzer;
-  }
-
-  private class TermDocumentInformationFactory {
-    private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
-    private LinkedList<Integer> termPositions = new LinkedList<Integer>();
-    private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
-  }
-
-  static class FieldSetting extends org.apache.lucene.store.instantiated.FieldSetting {
-
-    float boost = 1;
-    int position = 0;
-    int offset;
-    int fieldLength = 0;
-
-    boolean omitNorms = false;
-    boolean isBinary = false;
-
-    private FieldSetting() {
-    }
-
-    private FieldSetting(String fieldName) {
-      super(fieldName);
-    }
-  }
-
-}