pylucene 3.5.0-3: lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
package org.apache.lucene.store.instantiated;

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.Closeable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BitVector;

/**
 * This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism.
 * <p>
 * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader} navigates the same
 * instances in memory that this writer updates, so searchers that are active while you are
 * committing are bound to throw exceptions.
 * <p>
 * Consider using InstantiatedIndex as if it were immutable.
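 * <p>
 * A minimal usage sketch ({@code analyzer} and {@code document} are placeholders, and the
 * index is assumed to be created empty via the no-argument InstantiatedIndex constructor):
 * <pre>
 *   InstantiatedIndex index = new InstantiatedIndex();
 *   InstantiatedIndexWriter writer = new InstantiatedIndexWriter(index, analyzer, true);
 *   writer.addDocument(document);
 *   writer.close(); // commits the buffered documents
 *   InstantiatedIndexReader reader = index.indexReaderFactory();
 * </pre>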
 *
 * @see org.apache.lucene.index.IndexWriter
 */
public class InstantiatedIndexWriter implements Closeable {

  private PrintStream infoStream = null;

  private int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;

  private final InstantiatedIndex index;
  private final Analyzer analyzer;

  private Similarity similarity = Similarity.getDefault(); // how to normalize

  private transient Set<String> fieldNameBuffer;
  /**
   * linked to ensure chronological order
   */
  private Map<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>> termDocumentInformationFactoryByDocument = new LinkedHashMap<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>>(2000);

  private Set<InstantiatedDocument> unflushedDocuments = new HashSet<InstantiatedDocument>();

  public InstantiatedIndexWriter(InstantiatedIndex index) throws IOException {
    this(index, null);
  }

  public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer) throws IOException {
    this(index, analyzer, false);
  }

  public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer, boolean create) throws IOException {
    this.index = index;
    this.analyzer = analyzer;
    fieldNameBuffer = new HashSet<String>();
    if (create) {
      this.index.initialize();
    }
  }

  private int mergeFactor = 2500;

  /**
   * The sweet spot for this implementation is somewhere around 2500 documents of roughly 2K of text each.
   * <p/>
   * Benchmark output:
   * <pre>
   *  ------------> Report sum by Prefix (MAddDocs) and Round (8 about 8 out of 160153)
   *  Operation      round  mrg buf cmpnd   runCnt   recsPerRun        rec/s  elapsedSec    avgUsedMem    avgTotalMem
   *  MAddDocs_20000     0   10  10  true        1        20000         81,4      245,68   200 325 152    268 156 928
   *  MAddDocs_20000 -   1 1000  10  true -  -   1 -  -   20000 -  -   494,1 -  -  40,47 - 247 119 072 -  347 025 408
   *  MAddDocs_20000     2   10 100  true        1        20000        104,8      190,81   233 895 552    363 720 704
   *  MAddDocs_20000 -   3 2000 100  true -  -   1 -  -   20000 -  -   527,2 -  -  37,94 - 266 136 448 -  378 273 792
   *  MAddDocs_20000     4   10  10 false        1        20000        103,2      193,75   222 089 792    378 273 792
   *  MAddDocs_20000 -   5 3000  10 false -  -   1 -  -   20000 -  -   545,2 -  -  36,69 - 237 917 152 -  378 273 792
   *  MAddDocs_20000     6   10 100 false        1        20000        102,7      194,67   237 018 976    378 273 792
   *  MAddDocs_20000 -   7 4000 100 false -  -   1 -  -   20000 -  -   535,8 -  -  37,33 - 309 680 640 -  501 968 896
   * </pre>
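   * <p>
   * In this implementation the merge factor acts as the number of buffered documents that
   * triggers an automatic {@link #commit()} from {@link #addDocument(Document)}; for instance:
   * <pre>
   *   writer.setMergeFactor(2500); // commit() is invoked once 2500 documents are buffered
   * </pre>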
   *
   * @see org.apache.lucene.index.LogMergePolicy#setMergeFactor(int)
   */
  public void setMergeFactor(int mergeFactor) {
    this.mergeFactor = mergeFactor;
  }

  /**
   * @see org.apache.lucene.index.LogMergePolicy#getMergeFactor()
   */
  public int getMergeFactor() {
    return mergeFactor;
  }


  /**
   * If non-null, information about merges and a message when
   * maxFieldLength is reached would be printed to this stream;
   * this is not yet implemented.
   */
  public void setInfoStream(PrintStream infoStream) {
    this.infoStream = infoStream;
  }


  public void abort() throws IOException {
    // not implemented
  }


  public void addIndexes(IndexReader[] readers) {
    throw new RuntimeException("Not implemented");
  }


  public PrintStream getInfoStream() {
    return infoStream;
  }


  /**
   * Flushes all changes to an index and closes all associated files.
   */
  public void close() throws IOException {
    commit();
  }

  /**
   * Returns the number of documents currently in this index.
   */
  public int docCount() {
    // todo: not certain. see http://www.nabble.com/IndexWriter.docCount-tf3128882.html#a8669483
    return index.getDocumentsByNumber().length /* - index.getDeletedDocuments().size() */ + unflushedDocuments.size();
  }

  /**
   * Locks the index and commits the buffered documents.
   */
  public void commit() throws IOException {

    // todo write lock, unless held by caller

    boolean orderedTermsDirty = false;
    Set<InstantiatedTerm> dirtyTerms = new HashSet<InstantiatedTerm>(1000);

    Map<String, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
    for (String fieldName : fieldNameBuffer) {
      fieldSettingsByFieldName.put(fieldName, new FieldSetting(fieldName));
    }

    InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
    System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
    int documentNumber = index.getDocumentsByNumber().length;

    List<InstantiatedTerm> orderedTerms = new ArrayList<InstantiatedTerm>(index.getOrderedTerms().length + 5000);
    for (InstantiatedTerm instantiatedTerm : index.getOrderedTerms()) {
      orderedTerms.add(instantiatedTerm);
    }

    // update norm array with fake values for new documents
    Map<String, byte[]> normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>(index.getTermsByFieldAndText().size());
    Set<String> fieldNames = new HashSet<String>(20);
    fieldNames.addAll(index.getNormsByFieldNameAndDocumentNumber().keySet());
    fieldNames.addAll(fieldNameBuffer);
    for (String field : index.getTermsByFieldAndText().keySet()) {
      byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
      byte[] oldNorms = index.getNormsByFieldNameAndDocumentNumber().get(field);
      if (oldNorms != null) {
        System.arraycopy(oldNorms, 0, norms, 0, oldNorms.length);
        Arrays.fill(norms, oldNorms.length, norms.length, similarity.encodeNormValue(1.0f));
      } else {
        Arrays.fill(norms, 0, norms.length, similarity.encodeNormValue(1.0f));
      }
      normsByFieldNameAndDocumentNumber.put(field, norms);
      fieldNames.remove(field);
    }
    for (String field : fieldNames) {
      //System.out.println(field);
      byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
      Arrays.fill(norms, 0, norms.length, similarity.encodeNormValue(1.0f));
      normsByFieldNameAndDocumentNumber.put(field, norms);
    }
    fieldNames.clear();
    index.setNormsByFieldNameAndDocumentNumber(normsByFieldNameAndDocumentNumber);

    for (Map.Entry<InstantiatedDocument, Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>> eDocumentTermDocInfoByTermTextAndField : termDocumentInformationFactoryByDocument.entrySet()) {

      InstantiatedDocument document = eDocumentTermDocInfoByTermTextAndField.getKey();

      // assign document number
      document.setDocumentNumber(documentNumber++);
      documentsByNumber[document.getDocumentNumber()] = document;

      // set norms, prepare document and create optimized size collections.

      int numFieldsWithTermVectorsInDocument = 0;
      int termsInDocument = 0;
      for (Map.Entry<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> eFieldTermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
        if (eFieldTermDocInfoFactoriesByTermText.getKey().storeTermVector) {
          numFieldsWithTermVectorsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
        }
        termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();

        if (eFieldTermDocInfoFactoriesByTermText.getKey().indexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
          final FieldInvertState invertState = new FieldInvertState();
          invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost());
          invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
          final float norm = similarity.computeNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, invertState);
          normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = similarity.encodeNormValue(norm);
        } else {
          // field is not indexed or omits norms; no norm to compute
        }

      }

      // used for term vectors only, I think.
      Map<InstantiatedTerm, InstantiatedTermDocumentInformation> informationByTermOfCurrentDocument = new HashMap<InstantiatedTerm, InstantiatedTermDocumentInformation>(termsInDocument);


      Map<String, FieldSetting> documentFieldSettingsByFieldName = new HashMap<String, FieldSetting>(eDocumentTermDocInfoByTermTextAndField.getValue().size());

      // terms...
      for (Map.Entry<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> eFieldSetting_TermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
        documentFieldSettingsByFieldName.put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eFieldSetting_TermDocInfoFactoriesByTermText.getKey());

        // find or create term
        for (Map.Entry<String /*text*/, TermDocumentInformationFactory> eTermText_TermDocInfoFactory : eFieldSetting_TermDocInfoFactoriesByTermText.getValue().entrySet()) {

          // get term..
          InstantiatedTerm term;
          Map<String, InstantiatedTerm> termsByText = index.getTermsByFieldAndText().get(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName);
          if (termsByText == null) {
            termsByText = new HashMap<String, InstantiatedTerm>(1000);
            index.getTermsByFieldAndText().put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, termsByText);
            term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
            termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
            int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
            pos = -1 - pos;
            orderedTerms.add(pos, term);
            orderedTermsDirty = true;
          } else {
            term = termsByText.get(eTermText_TermDocInfoFactory.getKey());
            if (term == null) {
              term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
              termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
              int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
              pos = -1 - pos;
              orderedTerms.add(pos, term);
              orderedTermsDirty = true;
            }
          }

          // create association term document information
          //
          // [Term]-- {0..*} | {0..* ordered} --(field)[Document]
          //
          //                 |
          //        [TermDocumentInformation]

          int[] positions = new int[eTermText_TermDocInfoFactory.getValue().termPositions.size()];
          for (int i = 0; i < positions.length; i++) {
            positions[i] = eTermText_TermDocInfoFactory.getValue().termPositions.get(i);
          }

          byte[][] payloads = new byte[eTermText_TermDocInfoFactory.getValue().payloads.size()][];
          for (int i = 0; i < payloads.length; i++) {
            payloads[i] = eTermText_TermDocInfoFactory.getValue().payloads.get(i);
          }

          // couple

          InstantiatedTermDocumentInformation info = new InstantiatedTermDocumentInformation(term, document, /*eTermText_TermDocInfoFactory.getValue().termFrequency,*/ positions, payloads);

          // todo optimize, this should be cached and updated to array in batches rather than appending the array once for every position!
          InstantiatedTermDocumentInformation[] associatedDocuments;
          if (term.getAssociatedDocuments() != null) {
            associatedDocuments = new InstantiatedTermDocumentInformation[term.getAssociatedDocuments().length + 1];
            System.arraycopy(term.getAssociatedDocuments(), 0, associatedDocuments, 0, term.getAssociatedDocuments().length);
          } else {
            associatedDocuments = new InstantiatedTermDocumentInformation[1];
          }
          associatedDocuments[associatedDocuments.length - 1] = info;
          term.setAssociatedDocuments(associatedDocuments);

          // todo optimize, only if term vector?
          informationByTermOfCurrentDocument.put(term, info);


          dirtyTerms.add(term);
        }

        // term vector offsets
        if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().storeOffsetWithTermVector) {
          for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> e : informationByTermOfCurrentDocument.entrySet()) {
            if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName.equals(e.getKey().field())) {
              TermDocumentInformationFactory factory = eFieldSetting_TermDocInfoFactoriesByTermText.getValue().get(e.getKey().text());
              e.getValue().setTermOffsets(factory.termOffsets.toArray(new TermVectorOffsetInfo[factory.termOffsets.size()]));
            }
          }
        }
      }

      Map<String, List<InstantiatedTermDocumentInformation>> termDocumentInformationsByField = new HashMap<String, List<InstantiatedTermDocumentInformation>>();
      for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> eTerm_TermDocumentInformation : informationByTermOfCurrentDocument.entrySet()) {
        List<InstantiatedTermDocumentInformation> termDocumentInformations = termDocumentInformationsByField.get(eTerm_TermDocumentInformation.getKey().field());
        if (termDocumentInformations == null) {
          termDocumentInformations = new ArrayList<InstantiatedTermDocumentInformation>();
          termDocumentInformationsByField.put(eTerm_TermDocumentInformation.getKey().field(), termDocumentInformations);
        }
        termDocumentInformations.add(eTerm_TermDocumentInformation.getValue());
      }

      for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> eField_TermDocInfos : termDocumentInformationsByField.entrySet()) {

        CollectionUtil.quickSort(eField_TermDocInfos.getValue(), tdComp);

        // add term vector
        if (documentFieldSettingsByFieldName.get(eField_TermDocInfos.getKey()).storeTermVector) {
          if (document.getVectorSpace() == null) {
            document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>(documentFieldSettingsByFieldName.size()));
          }
          document.getVectorSpace().put(eField_TermDocInfos.getKey(), eField_TermDocInfos.getValue());
        }

      }
      fieldSettingsByFieldName.putAll(documentFieldSettingsByFieldName);
    }

    // order document informations in dirty terms
    for (InstantiatedTerm term : dirtyTerms) {
      // todo optimize, I believe this is useless, as the natural order already is document number?
      ArrayUtil.mergeSort(term.getAssociatedDocuments(), InstantiatedTermDocumentInformation.documentNumberComparator);

//      // update association class reference for speedy skipTo()
//      for (int i = 0; i < term.getAssociatedDocuments().length; i++) {
//        term.getAssociatedDocuments()[i].setIndexFromTerm(i);
//      }
    }


    // flush to writer
    index.setDocumentsByNumber(documentsByNumber);
    index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()]));

    for (FieldSetting fieldSetting : fieldSettingsByFieldName.values()) {
      index.getFieldSettings().merge(fieldSetting);
    }
    // set term index
    if (orderedTermsDirty) {
      // todo optimize, only update from start position
      for (int i = 0; i < index.getOrderedTerms().length; i++) {
        index.getOrderedTerms()[i].setTermIndex(i);
      }

    }

    // remove deleted documents
    IndexReader indexDeleter = index.indexReaderFactory();
    if (unflushedDeletions.size() > 0) {
      for (Term term : unflushedDeletions) {
        indexDeleter.deleteDocuments(term);
      }
      unflushedDeletions.clear();
    }


    // all done, clear buffers
    unflushedDocuments.clear();
    termDocumentInformationFactoryByDocument.clear();
    fieldNameBuffer.clear();


    // update deleted documents bitset
    if (index.getDeletedDocuments() != null) {
      BitVector deletedDocuments = new BitVector(index.getDocumentsByNumber().length);
      for (int i = 0; i < index.getDeletedDocuments().size(); i++) {
        if (index.getDeletedDocuments().get(i)) {
          deletedDocuments.set(i);
        }
      }
      index.setDeletedDocuments(deletedDocuments);
    }

    index.setVersion(System.currentTimeMillis());

    // todo unlock

    indexDeleter.close();

  }

  private static final Comparator<InstantiatedTermDocumentInformation> tdComp = new Comparator<InstantiatedTermDocumentInformation>() {
    public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
      return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
    }
  };

  /**
   * Adds a document to this index.  If the document contains more than
   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
   * discarded.
   */
  public void addDocument(Document doc) throws IOException {
    addDocument(doc, getAnalyzer());
  }

  /**
   * Adds a document to this index, using the provided analyzer instead of the
   * value of {@link #getAnalyzer()}.  If the document contains more than
   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
   * discarded.
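   * <p>
   * A minimal sketch of adding a document ({@code writer} and {@code analyzer} are placeholders,
   * and the field name and value are hypothetical):
   * <pre>
   *   Document doc = new Document();
   *   doc.add(new Field("title", "an example title", Field.Store.YES, Field.Index.ANALYZED));
   *   writer.addDocument(doc, analyzer);
   * </pre>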
   *
   * @param doc
   * @param analyzer
   * @throws IOException
   */
  public void addDocument(Document doc, Analyzer analyzer) throws IOException {
    addDocument(new InstantiatedDocument(doc), analyzer);
  }

  /**
   * Tokenizes a document and adds it to the buffer.
   * Try to do all calculations in this method rather than in commit, as this is a non-locking method.
   * Remember, this index implementation expects unlimited memory for maximum speed.
   *
   * @param document
   * @param analyzer
   * @throws IOException
   */
  protected void addDocument(InstantiatedDocument document, Analyzer analyzer) throws IOException {

    if (document.getDocumentNumber() != null) {
      throw new RuntimeException("Document number already set! Are you trying to add a document that already is bound to this or another index?");
    }

    // todo: write lock

    // normalize settings per field name in document

    Map<String /* field name */, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
    for (Fieldable field : document.getDocument().getFields()) {
      FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
      if (fieldSetting == null) {
        fieldSetting = new FieldSetting();
        fieldSetting.fieldName = StringHelper.intern(field.name());
        fieldSettingsByFieldName.put(fieldSetting.fieldName, fieldSetting);
        fieldNameBuffer.add(fieldSetting.fieldName);
      }

      // todo: fixme: multiple fields with the same name do not mean field boost += more boost.
      fieldSetting.boost *= field.getBoost();
      //fieldSettings.dimensions++;


      // flags are sticky: once set on a FieldSetting they stay set.
      if (field.getOmitNorms()) {
        fieldSetting.omitNorms = true;
      }
      if (field.isIndexed()) {
        fieldSetting.indexed = true;
      }
      if (field.isTokenized()) {
        fieldSetting.tokenized = true;
      }
      if (field.isStored()) {
        fieldSetting.stored = true;
      }
      if (field.isBinary()) {
        fieldSetting.isBinary = true;
      }
      if (field.isTermVectorStored()) {
        fieldSetting.storeTermVector = true;
      }
      if (field.isStorePositionWithTermVector()) {
        fieldSetting.storePositionWithTermVector = true;
      }
      if (field.isStoreOffsetWithTermVector()) {
        fieldSetting.storeOffsetWithTermVector = true;
      }
    }

    Map<Fieldable, LinkedList<Token>> tokensByField = new LinkedHashMap<Fieldable, LinkedList<Token>>(20);

    // tokenize indexed fields.
    for (Iterator<Fieldable> it = document.getDocument().getFields().iterator(); it.hasNext();) {

      Fieldable field = it.next();

      FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());

      if (field.isIndexed()) {

        LinkedList<Token> tokens = new LinkedList<Token>();
        tokensByField.put(field, tokens);

        if (field.isTokenized()) {
          final TokenStream tokenStream;
          // todo readerValue(), binaryValue()
          if (field.tokenStreamValue() != null) {
            tokenStream = field.tokenStreamValue();
          } else {
            tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
          }

          // reset the TokenStream to the first token
          tokenStream.reset();

          while (tokenStream.incrementToken()) {
            // TODO: this is a simple workaround to still work with tokens; not very efficient, but as far as I know, this writer should get removed soon:
            final Token token = new Token();
            for (Iterator<AttributeImpl> atts = tokenStream.getAttributeImplsIterator(); atts.hasNext();) {
              final AttributeImpl att = atts.next();
              try {
                att.copyTo(token);
              } catch (Exception e) {
                // ignore unsupported attributes: copyTo may fail for some attributes if a special
                // combined AttributeImpl is used that implements the basic attributes supported by
                // Token together with other customized ones in one class.
              }
            }
            tokens.add(token); // the vector will be built on commit.
            fieldSetting.fieldLength++;
            if (fieldSetting.fieldLength > maxFieldLength) {
              break;
            }
          }
          tokenStream.end();
          tokenStream.close();
        } else {
          // untokenized
          String fieldVal = field.stringValue();
          Token token = new Token(0, fieldVal.length(), "untokenized");
          token.setEmpty().append(fieldVal);
          tokens.add(token);
          fieldSetting.fieldLength++;
        }
      }

      if (!field.isStored()) {
        it.remove();
      }
    }


    Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting = new HashMap<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>();
    termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);

    // build term vector, term positions and term offsets
    for (Map.Entry<Fieldable, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
      FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());

      Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
      if (termDocumentInformationFactoryByTermText == null) {
        termDocumentInformationFactoryByTermText = new HashMap<String /*text*/, TermDocumentInformationFactory>();
        termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
      }

      int lastOffset = 0;

      // for each subsequent field instance with the same name, advance positions by the analyzer's position increment gap.
      if (fieldSetting.position > 0) {
        // todo what if no analyzer set, multiple fields with same name and index without tokenization?
        fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName);
      }

      for (Token token : eField_Tokens.getValue()) {

        TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.toString());
        if (termDocumentInformationFactory == null) {
          termDocumentInformationFactory = new TermDocumentInformationFactory();
          termDocumentInformationFactoryByTermText.put(token.toString(), termDocumentInformationFactory);
        }
        //termDocumentInformationFactory.termFrequency++;

        fieldSetting.position += (token.getPositionIncrement() - 1);
        termDocumentInformationFactory.termPositions.add(fieldSetting.position++);

        if (token.getPayload() != null && token.getPayload().length() > 0) {
          termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
          fieldSetting.storePayloads = true;
        } else {
          termDocumentInformationFactory.payloads.add(null);
        }

        if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {

          termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset()));
          lastOffset = fieldSetting.offset + token.endOffset();
        }


      }

      if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
        fieldSetting.offset = lastOffset + 1;
      }

    }


    unflushedDocuments.add(document);

    // if too many documents in buffer, commit.
    if (unflushedDocuments.size() >= getMergeFactor()) {
      commit(/*lock*/);
    }

    // todo: unlock write lock

  }


  private Set<Term> unflushedDeletions = new HashSet<Term>();

  public void deleteDocuments(Term term) throws IOException {
    unflushedDeletions.add(term);
  }

  public void deleteDocuments(Term[] terms) throws IOException {
    for (Term term : terms) {
      deleteDocuments(term);
    }
  }

  public void updateDocument(Term term, Document doc) throws IOException {
    updateDocument(term, doc, getAnalyzer());
  }

  public void updateDocument(Term term, Document doc, Analyzer analyzer) throws IOException {
    deleteDocuments(term);
    addDocument(doc, analyzer);
  }

  public int getMaxFieldLength() {
    return maxFieldLength;
  }

  public void setMaxFieldLength(int maxFieldLength) {
    this.maxFieldLength = maxFieldLength;
  }

  public Similarity getSimilarity() {
    return similarity;
  }

  public void setSimilarity(Similarity similarity) {
    this.similarity = similarity;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }

  private class TermDocumentInformationFactory {
    private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
    private LinkedList<Integer> termPositions = new LinkedList<Integer>();
    private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
  }


  static class FieldSetting extends org.apache.lucene.store.instantiated.FieldSetting {

    float boost = 1;
    int position = 0;
    int offset;
    int fieldLength = 0;

    boolean omitNorms = false;
    boolean isBinary = false;

    private FieldSetting() {
    }

    private FieldSetting(String fieldName) {
      super(fieldName);
    }
  }


}