lucene-java-3.4.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyWriter.java

   1 package org.apache.lucene.facet.taxonomy.lucene;
   2
   3 import java.io.BufferedInputStream;
   4 import java.io.BufferedOutputStream;
   5 import java.io.DataInputStream;
   6 import java.io.DataOutputStream;
   7 import java.io.File;
   8 import java.io.FileInputStream;
   9 import java.io.FileNotFoundException;
  10 import java.io.FileOutputStream;
  11 import java.io.IOException;
  12 import java.util.Map;
  13
  14 import org.apache.lucene.analysis.KeywordAnalyzer;
  15 import org.apache.lucene.analysis.TokenStream;
  16 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  17 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  18 import org.apache.lucene.document.Document;
  19 import org.apache.lucene.document.Field;
  20 import org.apache.lucene.document.Field.Index;
  21 import org.apache.lucene.document.Field.Store;
  22 import org.apache.lucene.index.CorruptIndexException;
  23 import org.apache.lucene.index.FieldInfo.IndexOptions;
  24 import org.apache.lucene.index.IndexReader;
  25 import org.apache.lucene.index.IndexWriter;
  26 import org.apache.lucene.index.IndexWriterConfig;
  27 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  28 import org.apache.lucene.index.LogByteSizeMergePolicy;
  29 import org.apache.lucene.index.Term;
  30 import org.apache.lucene.index.TermDocs;
  31 import org.apache.lucene.index.TermEnum;
  32 import org.apache.lucene.store.Directory;
  33 import org.apache.lucene.store.LockObtainFailedException;
  34 import org.apache.lucene.store.NativeFSLockFactory;
  35 import org.apache.lucene.store.SimpleFSLockFactory;
  36 import org.apache.lucene.util.Version;
  37
  38 import org.apache.lucene.facet.taxonomy.CategoryPath;
  39 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
  40 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
  41 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
  42 import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
  43 import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
  44
  45 /**
  46  * Licensed to the Apache Software Foundation (ASF) under one or more
  47  * contributor license agreements.  See the NOTICE file distributed with
  48  * this work for additional information regarding copyright ownership.
  49  * The ASF licenses this file to You under the Apache License, Version 2.0
  50  * (the "License"); you may not use this file except in compliance with
  51  * the License.  You may obtain a copy of the License at
  52  *
  53  *     http://www.apache.org/licenses/LICENSE-2.0
  54  *
  55  * Unless required by applicable law or agreed to in writing, software
  56  * distributed under the License is distributed on an "AS IS" BASIS,
  57  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  58  * See the License for the specific language governing permissions and
  59  * limitations under the License.
  60  */
  61
  62 /**
  63  * {@link TaxonomyWriter} which uses a Lucene index to store the taxonomy
  64  * information on disk, and keeps an additional in-memory cache of some or all
  65  * categories.
  66  * <P>
  67  * By using a Lucene index to store the information on disk, rather than some
  68  * specialized file format, we get for "free" Lucene's correctness (especially
  69  * regarding multi-process concurrency), and the ability to write to any
  70  * implementation of Directory (and not just the file system).
  71  * <P>
  72  * In addition to the permanently-stored Lucene index, efficiency dictates that
  73  * we also keep an in-memory cache of <B>recently seen</B> or <B>all</B>
  74  * categories, so that we do not need to go back to disk for every category
  75  * addition to see which ordinal this category already has, if any. A
  76  * {@link TaxonomyWriterCache} object determines the specific caching algorithm
  77  * used.
  78  * <p>
  79  * This class offers some hooks for extending classes to control the
  80  * {@link IndexWriter} instance that is used. See {@link #openLuceneIndex} and
  81  * {@link #closeLuceneIndex()} .
  82  *
  83  * @lucene.experimental
  84  */
  85 public class LuceneTaxonomyWriter implements TaxonomyWriter {
  86
  87   protected IndexWriter indexWriter;
  88   private int nextID;
  89   private char delimiter = Consts.DEFAULT_DELIMITER;
  90   private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
  91   private Field parentStreamField;
  92   private Field fullPathField;
  93
  94   private TaxonomyWriterCache cache;
  95   /**
  96    * We call the cache "complete" if we know that every category in our
  97    * taxonomy is in the cache. When the cache is <B>not</B> complete, and
  98    * we can't find a category in the cache, we still need to look for it
  99    * in the on-disk index; Therefore when the cache is not complete, we
 100    * need to open a "reader" to the taxonomy index.
 101    * The cache becomes incomplete if it was never filled with the existing
 102    * categories, or if a put() to the cache ever returned true (meaning
 103    * that some of the cached data was cleared).
 104    */
 105   private boolean cacheIsComplete;
 106   private IndexReader reader;
 107   private int cacheMisses;
 108
 109   /**
 110    * setDelimiter changes the character that the taxonomy uses in its internal
 111    * storage as a delimiter between category components. Do not use this
 112    * method unless you really know what you are doing. It has nothing to do
 113    * with whatever character the application may be using to represent
 114    * categories for its own use.
 115    * <P>
 116    * If you do use this method, make sure you call it before any other methods
 117    * that actually queries the taxonomy. Moreover, make sure you always pass
 118    * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader
 119    * objects you create for the same directory.
 120    */
 121   public void setDelimiter(char delimiter) {
 122     this.delimiter = delimiter;
 123   }
 124
 125   /**
 126    * Forcibly unlocks the taxonomy in the named directory.
 127    * <P>
 128    * Caution: this should only be used by failure recovery code, when it is
 129    * known that no other process nor thread is in fact currently accessing
 130    * this taxonomy.
 131    * <P>
 132    * This method is unnecessary if your {@link Directory} uses a
 133    * {@link NativeFSLockFactory} instead of the default
 134    * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock
 135    * does not stay behind forever when the process using it dies.
 136    */
 137   public static void unlock(Directory directory) throws IOException {
 138     IndexWriter.unlock(directory);
 139   }
 140
 141   /**
 142    * Construct a Taxonomy writer.
 143    *
 144    * @param directory
 145    *    The {@link Directory} in which to store the taxonomy. Note that
 146    *    the taxonomy is written directly to that directory (not to a
 147    *    subdirectory of it).
 148    * @param openMode
 149    *    Specifies how to open a taxonomy for writing: <code>APPEND</code>
 150    *    means open an existing index for append (failing if the index does
 151    *    not yet exist). <code>CREATE</code> means create a new index (first
 152    *    deleting the old one if it already existed).
 153    *    <code>APPEND_OR_CREATE</code> appends to an existing index if there
 154    *    is one, otherwise it creates a new index.
 155    * @param cache
 156    *    A {@link TaxonomyWriterCache} implementation which determines
 157    *    the in-memory caching policy. See for example
 158    *    {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
 159    *    If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
 160    * @throws CorruptIndexException
 161    *     if the taxonomy is corrupted.
 162    * @throws LockObtainFailedException
 163    *     if the taxonomy is locked by another writer. If it is known
 164    *     that no other concurrent writer is active, the lock might
 165    *     have been left around by an old dead process, and should be
 166    *     removed using {@link #unlock(Directory)}.
 167    * @throws IOException
 168    *     if another error occurred.
 169    */
 170   public LuceneTaxonomyWriter(Directory directory, OpenMode openMode,
 171                               TaxonomyWriterCache cache)
 172   throws CorruptIndexException, LockObtainFailedException,
 173   IOException {
 174
 175     openLuceneIndex(directory, openMode);
 176     reader = null;
 177
 178     parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
 179     parentStreamField.setOmitNorms(true);
 180     fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS);
 181     fullPathField.setIndexOptions(IndexOptions.DOCS_ONLY);
 182
 183     this.nextID = indexWriter.maxDoc();
 184
 185     if (cache==null) {
 186       cache = defaultTaxonomyWriterCache();
 187     }
 188     this.cache = cache;
 189
 190     if (nextID == 0) {
 191       cacheIsComplete = true;
 192       // Make sure that the taxonomy always contain the root category
 193       // with category id 0.
 194       addCategory(new CategoryPath());
 195       refreshReader();
 196     } else {
 197       // There are some categories on the disk, which we have not yet
 198       // read into the cache, and therefore the cache is incomplete.
 199       // We chose not to read all the categories into the cache now,
 200       // to avoid terrible performance when a taxonomy index is opened
 201       // to add just a single category. We will do it later, after we
 202       // notice a few cache misses.
 203       cacheIsComplete = false;
 204     }
 205     cacheMisses = 0;
 206   }
 207
 208   /**
 209    * A hook for extensions of this class to provide their own
 210    * {@link IndexWriter} implementation or instance. Extending classes can
 211    * instantiate and configure the {@link IndexWriter} as they see fit,
 212    * including setting a {@link org.apache.lucene.index.MergeScheduler}, or
 213    * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size
 214    * etc.<br>
 215    * <b>NOTE:</b> the instance this method returns will be closed upon calling
 216    * to {@link #close()}. If you wish to do something different, you should
 217    * override {@link #closeLuceneIndex()}.
 218    *
 219    * @param directory the {@link Directory} on top of wich an
 220    *        {@link IndexWriter} should be opened.
 221    * @param openMode see {@link OpenMode}
 222    */
 223   protected void openLuceneIndex (Directory directory, OpenMode openMode)
 224   throws CorruptIndexException, LockObtainFailedException, IOException {
 225     // Make sure we use a MergePolicy which merges segments in-order and thus
 226     // keeps the doc IDs ordered as well (this is crucial for the taxonomy
 227     // index).
 228     IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
 229         new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy(
 230         new LogByteSizeMergePolicy());
 231     indexWriter = new IndexWriter(directory, config);
 232   }
 233
 234   // Currently overridden by a unit test that verifies that every index we open
 235   // is close()ed.
 236   /**
 237    * Open an {@link IndexReader} from the {@link #indexWriter} member, by
 238    * calling {@link IndexWriter#getReader()}. Extending classes can override
 239    * this method to return their own {@link IndexReader}.
 240    */
 241   protected IndexReader openReader() throws IOException {
 242     return IndexReader.open(indexWriter, true);
 243   }
 244
 245   /**
 246    * Creates a new instance with a default cached as defined by
 247    * {@link #defaultTaxonomyWriterCache()}.
 248    */
 249   public LuceneTaxonomyWriter(Directory directory, OpenMode openMode)
 250   throws CorruptIndexException, LockObtainFailedException, IOException {
 251     this(directory, openMode, defaultTaxonomyWriterCache());
 252   }
 253
 254   /**
 255    * Defines the default {@link TaxonomyWriterCache} to use in constructors
 256    * which do not specify one.
 257    * <P>
 258    * The current default is {@link Cl2oTaxonomyWriterCache} constructed
 259    * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
 260    * cached in memory while building it.
 261    */
 262   public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
 263     return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
 264   }
 265
 266   // convenience constructors:
 267
 268   public LuceneTaxonomyWriter(Directory d)
 269   throws CorruptIndexException, LockObtainFailedException,
 270   IOException {
 271     this(d, OpenMode.CREATE_OR_APPEND);
 272   }
 273
 274   /**
 275    * Frees used resources as well as closes the underlying {@link IndexWriter},
 276    * which commits whatever changes made to it to the underlying
 277    * {@link Directory}.
 278    */
 279   public synchronized void close() throws CorruptIndexException, IOException {
 280     closeLuceneIndex();
 281     closeResources();
 282   }
 283
 284   /**
 285    * Returns the number of memory bytes used by the cache.
 286    * @return Number of cache bytes in memory, for CL2O only; zero otherwise.
 287    */
 288   public int getCacheMemoryUsage() {
 289     if (this.cache == null || !(this.cache instanceof Cl2oTaxonomyWriterCache)) {
 290       return 0;
 291     }
 292     return ((Cl2oTaxonomyWriterCache)this.cache).getMemoryUsage();
 293   }
 294
 295   /**
 296    * A hook for extending classes to close additional resources that were used.
 297    * The default implementation closes the {@link IndexReader} as well as the
 298    * {@link TaxonomyWriterCache} instances that were used. <br>
 299    * <b>NOTE:</b> if you override this method, you should include a
 300    * <code>super.closeResources()</code> call in your implementation.
 301    */
 302   protected synchronized void closeResources() throws IOException {
 303     if (reader != null) {
 304       reader.close();
 305       reader = null;
 306     }
 307     if (cache != null) {
 308       cache.close();
 309       cache = null;
 310     }
 311   }
 312
 313   /**
 314    * A hook for extending classes to control closing the {@link IndexWriter}
 315    * returned by {@link #openLuceneIndex}.
 316    */
 317   protected void closeLuceneIndex() throws CorruptIndexException, IOException {
 318     if (indexWriter != null) {
 319       indexWriter.close();
 320       indexWriter = null;
 321     }
 322   }
 323
 324   /**
 325    * Look up the given category in the cache and/or the on-disk storage,
 326    * returning the category's ordinal, or a negative number in case the
 327    * category does not yet exist in the taxonomy.
 328    */
 329   protected int findCategory(CategoryPath categoryPath) throws IOException {
 330     // If we can find the category in our cache, we can return the
 331     // response directly from it:
 332     int res = cache.get(categoryPath);
 333     if (res >= 0) {
 334       return res;
 335     }
 336     // If we know that the cache is complete, i.e., contains every category
 337     // which exists, we can return -1 immediately. However, if the cache is
 338     // not complete, we need to check the disk.
 339     if (cacheIsComplete) {
 340       return -1;
 341     }
 342     cacheMisses++;
 343     // After a few cache misses, it makes sense to read all the categories
 344     // from disk and into the cache. The reason not to do this on the first
 345     // cache miss (or even when opening the writer) is that it will
 346     // significantly slow down the case when a taxonomy is opened just to
 347     // add one category. The idea only spending a long time on reading
 348     // after enough time was spent on cache misses is known as a "online
 349     // algorithm".
 350     if (perhapsFillCache()) {
 351       return cache.get(categoryPath);
 352     }
 353
 354     // We need to get an answer from the on-disk index. If a reader
 355     // is not yet open, do it now:
 356     if (reader == null) {
 357       reader = openReader();
 358     }
 359
 360     TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
 361         .toString(delimiter)));
 362     if (!docs.next()) {
 363       return -1; // category does not exist in taxonomy
 364     }
 365     // Note: we do NOT add to the cache the fact that the category
 366     // does not exist. The reason is that our only use for this
 367     // method is just before we actually add this category. If
 368     // in the future this usage changes, we should consider caching
 369     // the fact that the category is not in the taxonomy.
 370     addToCache(categoryPath, docs.doc());
 371     return docs.doc();
 372   }
 373
 374   /**
 375    * Look up the given prefix of the given category in the cache and/or the
 376    * on-disk storage, returning that prefix's ordinal, or a negative number in
 377    * case the category does not yet exist in the taxonomy.
 378    */
 379   private int findCategory(CategoryPath categoryPath, int prefixLen)
 380   throws IOException {
 381     int res = cache.get(categoryPath, prefixLen);
 382     if (res >= 0) {
 383       return res;
 384     }
 385     if (cacheIsComplete) {
 386       return -1;
 387     }
 388     cacheMisses++;
 389     if (perhapsFillCache()) {
 390       return cache.get(categoryPath, prefixLen);
 391     }
 392     if (reader == null) {
 393       reader = openReader();
 394     }
 395     TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
 396         .toString(delimiter, prefixLen)));
 397     if (!docs.next()) {
 398       return -1; // category does not exist in taxonomy
 399     }
 400     addToCache(categoryPath, prefixLen, docs.doc());
 401     return docs.doc();
 402   }
 403
 404   // TODO (Facet): addCategory() is synchronized. This means that if indexing is
 405   // multi-threaded, a new category that needs to be written to disk (and
 406   // potentially even trigger a lengthy merge) locks out other addCategory()
 407   // calls - even those which could immediately return a cached value.
 408   // We definitely need to fix this situation!
 409   public synchronized int addCategory(CategoryPath categoryPath)
 410   throws IOException {
 411     // If the category is already in the cache and/or the taxonomy, we
 412     // should return its existing ordinal:
 413     int res = findCategory(categoryPath);
 414     if (res < 0) {
 415       // This is a new category, and we need to insert it into the index
 416       // (and the cache). Actually, we might also need to add some of
 417       // the category's ancestors before we can add the category itself
 418       // (while keeping the invariant that a parent is always added to
 419       // the taxonomy before its child). internalAddCategory() does all
 420       // this recursively:
 421       res = internalAddCategory(categoryPath, categoryPath.length());
 422     }
 423     return res;
 424
 425   }
 426
 427   /**
 428    * Add a new category into the index (and the cache), and return its new
 429    * ordinal.
 430    * <P>
 431    * Actually, we might also need to add some of the category's ancestors
 432    * before we can add the category itself (while keeping the invariant that a
 433    * parent is always added to the taxonomy before its child). We do this by
 434    * recursion.
 435    */
 436   private int internalAddCategory(CategoryPath categoryPath, int length)
 437   throws CorruptIndexException, IOException {
 438
 439     // Find our parent's ordinal (recursively adding the parent category
 440     // to the taxonomy if it's not already there). Then add the parent
 441     // ordinal as payloads (rather than a stored field; payloads can be
 442     // more efficiently read into memory in bulk by LuceneTaxonomyReader)
 443     int parent;
 444     if (length > 1) {
 445       parent = findCategory(categoryPath, length - 1);
 446       if (parent < 0) {
 447         parent = internalAddCategory(categoryPath, length - 1);
 448       }
 449     } else if (length == 1) {
 450       parent = TaxonomyReader.ROOT_ORDINAL;
 451     } else {
 452       parent = TaxonomyReader.INVALID_ORDINAL;
 453     }
 454     int id = addCategoryDocument(categoryPath, length, parent);
 455
 456     return id;
 457   }
 458
 459   // Note that the methods calling addCategoryDocument() are synchornized,
 460   // so this method is effectively synchronized as well, but we'll add
 461   // synchronized to be on the safe side, and we can reuse class-local objects
 462   // instead of allocating them every time
 463   protected synchronized int addCategoryDocument(CategoryPath categoryPath,
 464                                                   int length, int parent)
 465       throws CorruptIndexException, IOException {
 466     // Before Lucene 2.9, position increments >=0 were supported, so we
 467     // added 1 to parent to allow the parent -1 (the parent of the root).
 468     // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
 469     // no longer enough, since 0 is not encoded consistently either (see
 470     // comment in SinglePositionTokenStream). But because we must be
 471     // backward-compatible with existing indexes, we can't just fix what
 472     // we write here (e.g., to write parent+2), and need to do a workaround
 473     // in the reader (which knows that anyway only category 0 has a parent
 474     // -1).
 475     parentStream.set(parent+1);
 476     Document d = new Document();
 477     d.add(parentStreamField);
 478
 479     fullPathField.setValue(categoryPath.toString(delimiter, length));
 480     d.add(fullPathField);
 481
 482     // Note that we do no pass an Analyzer here because the fields that are
 483     // added to the Document are untokenized or contains their own TokenStream.
 484     // Therefore the IndexWriter's Analyzer has no effect.
 485     indexWriter.addDocument(d);
 486     int id = nextID++;
 487
 488     addToCache(categoryPath, length, id);
 489
 490     // also add to the parent array
 491     getParentArray().add(id, parent);
 492
 493     return id;
 494   }
 495
 496   private static class SinglePositionTokenStream extends TokenStream {
 497     private CharTermAttribute termAtt;
 498     private PositionIncrementAttribute posIncrAtt;
 499     private boolean returned;
 500     public SinglePositionTokenStream(String word) {
 501       termAtt = addAttribute(CharTermAttribute.class);
 502       posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 503       termAtt.setEmpty().append(word);
 504       returned = true;
 505     }
 506     /**
 507      * Set the value we want to keep, as the position increment.
 508      * Note that when TermPositions.nextPosition() is later used to
 509      * retrieve this value, val-1 will be returned, not val.
 510      * <P>
 511      * IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
 512      * the retrieved position would be -1). But starting with Lucene 2.9,
 513      * this unfortunately changed, and only val>0 are safe. val=0 can
 514      * still be used, but don't count on the value you retrieve later
 515      * (it could be 0 or -1, depending on circumstances or versions).
 516      * This change is described in Lucene's JIRA: LUCENE-1542.
 517      */
 518     public void set(int val) {
 519       posIncrAtt.setPositionIncrement(val);
 520       returned = false;
 521     }
 522     @Override
 523     public boolean incrementToken() throws IOException {
 524       if (returned) {
 525         return false;
 526       }
 527       returned = true;
 528       return true;
 529     }
 530   }
 531
 532   private void addToCache(CategoryPath categoryPath, int id)
 533   throws CorruptIndexException, IOException {
 534     if (cache.put(categoryPath, id)) {
 535       // If cache.put() returned true, it means the cache was limited in
 536       // size, became full, so parts of it had to be cleared.
 537       // Unfortunately we don't know which part was cleared - it is
 538       // possible that a relatively-new category that hasn't yet been
 539       // committed to disk (and therefore isn't yet visible in our
 540       // "reader") was deleted from the cache, and therefore we must
 541       // now refresh the reader.
 542       // Because this is a slow operation, cache implementations are
 543       // expected not to delete entries one-by-one but rather in bulk
 544       // (LruTaxonomyWriterCache removes the 2/3rd oldest entries).
 545       refreshReader();
 546       cacheIsComplete = false;
 547     }
 548   }
 549
 550   private void addToCache(CategoryPath categoryPath, int prefixLen, int id)
 551   throws CorruptIndexException, IOException {
 552     if (cache.put(categoryPath, prefixLen, id)) {
 553       refreshReader();
 554       cacheIsComplete = false;
 555     }
 556   }
 557
 558   private synchronized void refreshReader() throws IOException {
 559     if (reader != null) {
 560       IndexReader r2 = reader.reopen();
 561       if (reader != r2) {
 562         reader.close();
 563         reader = r2;
 564       }
 565     }
 566   }
 567
 568   /**
 569    * Calling commit() ensures that all the categories written so far are
 570    * visible to a reader that is opened (or reopened) after that call.
 571    * When the index is closed(), commit() is also implicitly done.
 572    * See {@link TaxonomyWriter#commit()}
 573    */
 574   public synchronized void commit() throws CorruptIndexException, IOException {
 575     indexWriter.commit();
 576     refreshReader();
 577   }
 578
 579   /**
 580    * Like commit(), but also store properties with the index. These properties
 581    * are retrievable by {@link LuceneTaxonomyReader#getCommitUserData}.
 582    * See {@link TaxonomyWriter#commit(Map)}.
 583    */
 584   public synchronized void commit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
 585     indexWriter.commit(commitUserData);
 586     refreshReader();
 587   }
 588
 589   /**
 590    * prepare most of the work needed for a two-phase commit.
 591    * See {@link IndexWriter#prepareCommit}.
 592    */
 593   public synchronized void prepareCommit() throws CorruptIndexException, IOException {
 594     indexWriter.prepareCommit();
 595   }
 596
 597   /**
 598    * Like above, and also prepares to store user data with the index.
 599    * See {@link IndexWriter#prepareCommit(Map)}
 600    */
 601   public synchronized void prepareCommit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
 602     indexWriter.prepareCommit(commitUserData);
 603   }
 604
 605   /**
 606    * getSize() returns the number of categories in the taxonomy.
 607    * <P>
 608    * Because categories are numbered consecutively starting with 0, it means
 609    * the taxonomy contains ordinals 0 through getSize()-1.
 610    * <P>
 611    * Note that the number returned by getSize() is often slightly higher than
 612    * the number of categories inserted into the taxonomy; This is because when
 613    * a category is added to the taxonomy, its ancestors are also added
 614    * automatically (including the root, which always get ordinal 0).
 615    */
 616   synchronized public int getSize() {
 617     return indexWriter.maxDoc();
 618   }
 619
 620   private boolean alreadyCalledFillCache = false;
 621
 622   /**
 623    * Set the number of cache misses before an attempt is made to read the
 624    * entire taxonomy into the in-memory cache.
 625    * <P>
 626    * LuceneTaxonomyWriter holds an in-memory cache of recently seen
 627    * categories to speed up operation. On each cache-miss, the on-disk index
 628    * needs to be consulted. When an existing taxonomy is opened, a lot of
 629    * slow disk reads like that are needed until the cache is filled, so it
 630    * is more efficient to read the entire taxonomy into memory at once.
 631    * We do this complete read after a certain number (defined by this method)
 632    * of cache misses.
 633    * <P>
 634    * If the number is set to <CODE>0</CODE>, the entire taxonomy is read
 635    * into the cache on first use, without fetching individual categories
 636    * first.
 637    * <P>
 638    * Note that if the memory cache of choice is limited in size, and cannot
 639    * hold the entire content of the on-disk taxonomy, then it is never
 640    * read in its entirety into the cache, regardless of the setting of this
 641    * method.
 642    */
 643   public void setCacheMissesUntilFill(int i) {
 644     cacheMissesUntilFill = i;
 645   }
 646   private int cacheMissesUntilFill = 11;
 647
 648   private boolean perhapsFillCache() throws IOException {
 649     // Note: we assume that we're only called when cacheIsComplete==false.
 650     // TODO (Facet): parametrize this criterion:
 651     if (cacheMisses < cacheMissesUntilFill) {
 652       return false;
 653     }
 654     // If the cache was already filled (or we decided not to fill it because
 655     // there was no room), there is no sense in trying it again.
 656     if (alreadyCalledFillCache) {
 657       return false;
 658     }
 659     alreadyCalledFillCache = true;
 660     // TODO (Facet): we should probably completely clear the cache before starting
 661     // to read it?
 662     if (reader == null) {
 663       reader = openReader();
 664     }
 665
 666     if (!cache.hasRoom(reader.numDocs())) {
 667       return false;
 668     }
 669
 670     CategoryPath cp = new CategoryPath();
 671     TermDocs td = reader.termDocs();
 672     Term fullPathTerm = new Term(Consts.FULL);
 673     String field = fullPathTerm.field(); // needed so we can later use !=
 674     TermEnum terms = reader.terms(fullPathTerm);
 675     // The check is done here to avoid checking it on every iteration of the
 676     // below loop. A null term wlil be returned if there are no terms in the
 677     // lexicon, or after the Consts.FULL term. However while the loop is
 678     // executed we're safe, because we only iterate as long as there are next()
 679     // terms.
 680     if (terms.term() != null) {
 681       do {
 682         Term t = terms.term();
 683         if (t.field() != field) break;
 684         // Since we guarantee uniqueness of categories, each term has exactly
 685         // one document. Also, since we do not allow removing categories (and
 686         // hence documents), there are no deletions in the index. Therefore, it
 687         // is sufficient to call next(), and then doc(), exactly once with no
 688         // 'validation' checks.
 689         td.seek(t);
 690         td.next();
 691         cp.clear();
 692         cp.add(t.text(), delimiter);
 693         cache.put(cp, td.doc());
 694       } while (terms.next());
 695     }
 696
 697     cacheIsComplete = true;
 698     // No sense to keep the reader open - we will not need to read from it
 699     // if everything is in the cache.
 700     reader.close();
 701     reader = null;
 702     return true;
 703   }
 704
 705   // TODO (Facet): synchronization of some sort?
 706   private ParentArray parentArray;
 707   private ParentArray getParentArray() throws IOException {
 708     if (parentArray==null) {
 709       if (reader == null) {
 710         reader = openReader();
 711       }
 712       parentArray = new ParentArray();
 713       parentArray.refresh(reader);
 714     }
 715     return parentArray;
 716   }
 717   public int getParent(int ordinal) throws IOException {
 718     // Note: the following if() just enforces that a user can never ask
 719     // for the parent of a nonexistant category - even if the parent array
 720     // was allocated bigger than it really needs to be.
 721     if (ordinal >= getSize()) {
 722       throw new ArrayIndexOutOfBoundsException();
 723     }
 724     return getParentArray().getArray()[ordinal];
 725   }
 726
 727   /**
 728    * Take all the categories of one or more given taxonomies, and add them to
 729    * the main taxonomy (this), if they are not already there.
 730    * <P>
 731    * Additionally, fill a <I>mapping</I> for each of the added taxonomies,
 732    * mapping its ordinals to the ordinals in the enlarged main taxonomy.
 733    * These mapping are saved into an array of OrdinalMap objects given by the
 734    * user, one for each of the given taxonomies (not including "this", the main
 735    * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
 736    * others will be a DiskOrdinalMap - see discussion in {OrdinalMap}.
 737    * <P>
 738    * Note that the taxonomies to be added are given as Directory objects,
 739    * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
 740    * currently managed by an open TaxonomyWriter, make sure to commit() (or
 741    * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
 742    * and does not need to be commit()ed before this call.
 743    */
 744   public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
 745     // To prevent us stepping on the rest of this class's decisions on when
 746     // to open a reader, and when not, we'll be opening a new reader instead
 747     // of using the existing "reader" object:
 748     IndexReader mainreader = openReader();
 749     TermEnum mainte = mainreader.terms(new Term(Consts.FULL));
 750
 751     IndexReader[] otherreaders = new IndexReader[taxonomies.length];
 752     TermEnum[] othertes = new TermEnum[taxonomies.length];
 753     for (int i=0; i<taxonomies.length; i++) {
 754       otherreaders[i] = IndexReader.open(taxonomies[i]);
 755       othertes[i] = otherreaders[i].terms(new Term(Consts.FULL));
 756       // Also tell the ordinal maps their expected sizes:
 757       ordinalMaps[i].setSize(otherreaders[i].numDocs());
 758     }
 759
 760     CategoryPath cp = new CategoryPath();
 761
 762     // We keep a "current" cursor over the alphabetically-ordered list of
 763     // categories in each taxonomy. We start the cursor on the first
 764     // (alphabetically) category of each taxonomy:
 765
 766     String currentMain;
 767     String[] currentOthers = new String[taxonomies.length];
 768     currentMain = nextTE(mainte);
 769     int otherTaxonomiesLeft = 0;
 770     for (int i=0; i<taxonomies.length; i++) {
 771       currentOthers[i] = nextTE(othertes[i]);
 772       if (currentOthers[i]!=null) {
 773         otherTaxonomiesLeft++;
 774       }
 775     }
 776
 777     // And then, at each step look at the first (alphabetically) of the
 778     // current taxonomies.
 779     // NOTE: The most efficient way we could have done this is using a
 780     // PriorityQueue. But for simplicity, and assuming that usually we'll
 781     // have a very small number of other taxonomies (often just 1), we use
 782     // a more naive algorithm (o(ntaxonomies) instead of o(ln ntaxonomies)
 783     // per step)
 784
 785     while (otherTaxonomiesLeft>0) {
 786       String first=null;
 787       for (int i=0; i<taxonomies.length; i++) {
 788         if (currentOthers[i]==null) continue;
 789         if (first==null || first.compareTo(currentOthers[i])>0) {
 790           first = currentOthers[i];
 791         }
 792       }
 793       int comp = 0;
 794       if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
 795         // If 'first' is before currentMain, or currentMain is null,
 796         // then 'first' is a new category and we need to add it to the
 797         // main taxonomy. Then for all taxonomies with this 'first'
 798         // category, we need to add the new category number to their
 799         // map, and move to the next category in all of them.
 800         cp.clear();
 801         cp.add(first, delimiter);
 802         // We can call internalAddCategory() instead of addCategory()
 803         // because we know the category hasn't been seen yet.
 804         int newordinal = internalAddCategory(cp, cp.length());
 805         // TODO (Facet): we already had this term in our hands before, in nextTE...
 806         Term t = new Term(Consts.FULL, first);
 807         for (int i=0; i<taxonomies.length; i++) {
 808           if (first.equals(currentOthers[i])) {
 809             // remember the remapping of this ordinal. Note how
 810             // this requires reading a posting list from the index -
 811             // but since we do this in lexical order of terms, just
 812             // like Lucene's merge works, we hope there are few seeks.
 813             // TODO (Facet): is there a quicker way? E.g., not specifying the
 814             // next term by name every time?
 815             TermDocs td = otherreaders[i].termDocs(t);
 816             td.next(); // TODO (Facet): check?
 817             int origordinal = td.doc();
 818             ordinalMaps[i].addMapping(origordinal, newordinal);
 819             // and move to the next category in the i'th taxonomy
 820             currentOthers[i] = nextTE(othertes[i]);
 821             if (currentOthers[i]==null) {
 822               otherTaxonomiesLeft--;
 823             }
 824           }
 825         }
 826       } else if (comp==0) {
 827         // 'first' and currentMain are the same, so both the main and some
 828         // other taxonomies need to be moved, but a category doesn't need
 829         // to be added because it already existed in the main taxonomy.
 830
 831         // TODO (Facet): Again, is there a quicker way?
 832         Term t = new Term(Consts.FULL, first);
 833         TermDocs td = mainreader.termDocs(t);
 834         td.next(); // TODO (Facet): check?
 835         int newordinal = td.doc();
 836
 837         currentMain = nextTE(mainte);
 838         for (int i=0; i<taxonomies.length; i++) {
 839           if (first.equals(currentOthers[i])) {
 840             // TODO (Facet): again, is there a quicker way?
 841             td = otherreaders[i].termDocs(t);
 842             td.next(); // TODO (Facet): check?
 843             int origordinal = td.doc();
 844             ordinalMaps[i].addMapping(origordinal, newordinal);
 845
 846             // and move to the next category
 847             currentOthers[i] = nextTE(othertes[i]);
 848             if (currentOthers[i]==null) {
 849               otherTaxonomiesLeft--;
 850             }
 851           }
 852         }
 853       } else /* comp > 0 */ {
 854         // The currentMain doesn't appear in any of the other taxonomies -
 855         // we don't need to do anything, just continue to the next one
 856         currentMain = nextTE(mainte);
 857       }
 858     }
 859
 860     // Close all the readers we've opened, and also tell the ordinal maps
 861     // we're done adding to them
 862     mainreader.close();
 863     for (int i=0; i<taxonomies.length; i++) {
 864       otherreaders[i].close();
 865       // We never actually added a mapping for the root ordinal - let's do
 866       // it now, just so that the map is complete (every ordinal between 0
 867       // and size-1 is remapped)
 868       ordinalMaps[i].addMapping(0, 0);
 869       ordinalMaps[i].addDone();
 870     }
 871   }
 872
 873   /**
 874    * Mapping from old ordinal to new ordinals, used when merging indexes
 875    * wit separate taxonomies.
 876    * <p>
 877    * addToTaxonomies() merges one or more taxonomies into the given taxonomy
 878    * (this). An OrdinalMap is filled for each of the added taxonomies,
 879    * containing the new ordinal (in the merged taxonomy) of each of the
 880    * categories in the old taxonomy.
 881    * <P>
 882    * There exist two implementations of OrdinalMap: MemoryOrdinalMap and
 883    * DiskOrdinalMap. As their names suggest, the former keeps the map in
 884    * memory and the latter in a temporary disk file. Because these maps will
 885    * later be needed one by one (to remap the counting lists), not all at the
 886    * same time, it is recommended to put the first taxonomy's map in memory,
 887    * and all the rest on disk (later to be automatically read into memory one
 888    * by one, when needed).
 889    */
 890   public static interface OrdinalMap {
 891     /**
 892      * Set the size of the map. This MUST be called before addMapping().
 893      * It is assumed (but not verified) that addMapping() will then be
 894      * called exactly 'size' times, with different origOrdinals between 0
 895      * and size-1.
 896      */
 897     public void setSize(int size) throws IOException;
 898     public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
 899     /**
 900      * Call addDone() to say that all addMapping() have been done.
 901      * In some implementations this might free some resources.
 902      */
 903     public void addDone() throws IOException;
 904     /**
 905      * Return the map from the taxonomy's original (consecutive) ordinals
 906      * to the new taxonomy's ordinals. If the map has to be read from disk
 907      * and ordered appropriately, it is done when getMap() is called.
 908      * getMap() should only be called once, and only when the map is actually
 909      * needed. Calling it will also free all resources that the map might
 910      * be holding (such as temporary disk space), other than the returned int[].
 911      */
 912     public int[] getMap() throws IOException;
 913   }
 914
 915   /**
 916    * {@link OrdinalMap} maintained in memory
 917    */
 918   public static final class MemoryOrdinalMap implements OrdinalMap {
 919     int[] map;
 920     public void setSize(int taxonomySize) {
 921       map = new int[taxonomySize];
 922     }
 923     public void addMapping(int origOrdinal, int newOrdinal) {
 924       map[origOrdinal] = newOrdinal;
 925     }
 926     public void addDone() { /* nothing to do */ }
 927     public int[] getMap() {
 928       return map;
 929     }
 930   }
 931
 932   /**
 933    * {@link OrdinalMap} maintained on file system
 934    */
 935   public static final class DiskOrdinalMap implements OrdinalMap {
 936     File tmpfile;
 937     DataOutputStream out;
 938
 939     public DiskOrdinalMap(File tmpfile) throws FileNotFoundException {
 940       this.tmpfile = tmpfile;
 941       out = new DataOutputStream(new BufferedOutputStream(
 942           new FileOutputStream(tmpfile)));
 943     }
 944
 945     public void addMapping(int origOrdinal, int newOrdinal) throws IOException {
 946       out.writeInt(origOrdinal);
 947       out.writeInt(newOrdinal);
 948     }
 949
 950     public void setSize(int taxonomySize) throws IOException {
 951       out.writeInt(taxonomySize);
 952     }
 953
 954     public void addDone() throws IOException {
 955       if (out!=null) {
 956         out.close();
 957         out = null;
 958       }
 959     }
 960
 961     int[] map = null;
 962
 963     public int[] getMap() throws IOException {
 964       if (map!=null) {
 965         return map;
 966       }
 967       addDone(); // in case this wasn't previously called
 968       DataInputStream in = new DataInputStream(new BufferedInputStream(
 969           new FileInputStream(tmpfile)));
 970       map = new int[in.readInt()];
 971       // NOTE: The current code assumes here that the map is complete,
 972       // i.e., every ordinal gets one and exactly one value. Otherwise,
 973       // we may run into an EOF here, or vice versa, not read everything.
 974       for (int i=0; i<map.length; i++) {
 975         int origordinal = in.readInt();
 976         int newordinal = in.readInt();
 977         map[origordinal] = newordinal;
 978       }
 979       in.close();
 980       // Delete the temporary file, which is no longer needed.
 981       if (!tmpfile.delete()) {
 982         tmpfile.deleteOnExit();
 983       }
 984       return map;
 985     }
 986   }
 987
 988   private static final String nextTE(TermEnum te) throws IOException {
 989     if (te.next()) {
 990       Term t = te.term();
 991       // If our enumeration reached a different field, we're done. Note
 992       // how we're allowed compare string references, rather than the
 993       // actual string's contents.
 994       if (t.field()==Consts.FULL) {
 995         return t.text();
 996       }
 997       return null;
 998     }
 999     return null;
1000   }
1001
1002 }