lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java

   1 package org.apache.lucene.facet.taxonomy.directory;
   2
   3 import java.io.BufferedInputStream;
   4 import java.io.BufferedOutputStream;
   5 import java.io.DataInputStream;
   6 import java.io.DataOutputStream;
   7 import java.io.File;
   8 import java.io.FileInputStream;
   9 import java.io.FileNotFoundException;
  10 import java.io.FileOutputStream;
  11 import java.io.IOException;
  12 import java.util.HashMap;
  13 import java.util.Map;
  14
  15 import org.apache.lucene.analysis.KeywordAnalyzer;
  16 import org.apache.lucene.analysis.TokenStream;
  17 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  18 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  19 import org.apache.lucene.document.Document;
  20 import org.apache.lucene.document.Field;
  21 import org.apache.lucene.document.Field.Index;
  22 import org.apache.lucene.document.Field.Store;
  23 import org.apache.lucene.index.CorruptIndexException;
  24 import org.apache.lucene.index.FieldInfo.IndexOptions;
  25 import org.apache.lucene.index.IndexReader;
  26 import org.apache.lucene.index.IndexWriter;
  27 import org.apache.lucene.index.IndexWriterConfig;
  28 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
  29 import org.apache.lucene.index.LogByteSizeMergePolicy;
  30 import org.apache.lucene.index.Term;
  31 import org.apache.lucene.index.TermDocs;
  32 import org.apache.lucene.index.TermEnum;
  33 import org.apache.lucene.store.AlreadyClosedException;
  34 import org.apache.lucene.store.Directory;
  35 import org.apache.lucene.store.LockObtainFailedException;
  36 import org.apache.lucene.store.NativeFSLockFactory;
  37 import org.apache.lucene.store.SimpleFSLockFactory;
  38 import org.apache.lucene.util.Version;
  39
  40 import org.apache.lucene.facet.taxonomy.CategoryPath;
  41 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
  42 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
  43 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
  44 import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
  45 import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
  46
  47 /**
  48  * Licensed to the Apache Software Foundation (ASF) under one or more
  49  * contributor license agreements.  See the NOTICE file distributed with
  50  * this work for additional information regarding copyright ownership.
  51  * The ASF licenses this file to You under the Apache License, Version 2.0
  52  * (the "License"); you may not use this file except in compliance with
  53  * the License.  You may obtain a copy of the License at
  54  *
  55  *     http://www.apache.org/licenses/LICENSE-2.0
  56  *
  57  * Unless required by applicable law or agreed to in writing, software
  58  * distributed under the License is distributed on an "AS IS" BASIS,
  59  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  60  * See the License for the specific language governing permissions and
  61  * limitations under the License.
  62  */
  63
  64 /**
  65  * {@link TaxonomyWriter} which uses a {@link Directory} to store the taxonomy
  66  * information on disk, and keeps an additional in-memory cache of some or all
  67  * categories.
  68  * <p>
  69  * In addition to the permanently-stored information in the {@link Directory},
  70  * efficiency dictates that we also keep an in-memory cache of <B>recently
  71  * seen</B> or <B>all</B> categories, so that we do not need to go back to disk
  72  * for every category addition to see which ordinal this category already has,
  73  * if any. A {@link TaxonomyWriterCache} object determines the specific caching
  74  * algorithm used.
  75  * <p>
  76  * This class offers some hooks for extending classes to control the
  77  * {@link IndexWriter} instance that is used. See {@link #openIndexWriter}.
  78  *
  79  * @lucene.experimental
  80  */
  81 public class DirectoryTaxonomyWriter implements TaxonomyWriter {
  82
  83   /**
  84    * Property name of user commit data that contains the creation time of a taxonomy index.
  85    * <p>
  86    * Applications making use of {@link TaxonomyWriter#commit(Map)} should not use this
  87    * particular property name.
  88    */
  89   public static final String INDEX_CREATE_TIME = "index.create.time";
  90
  91   private IndexWriter indexWriter;
  92   private int nextID;
  93   private char delimiter = Consts.DEFAULT_DELIMITER;
  94   private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
  95   private Field parentStreamField;
  96   private Field fullPathField;
  97
  98   private TaxonomyWriterCache cache;
  99   /**
 100    * We call the cache "complete" if we know that every category in our
 101    * taxonomy is in the cache. When the cache is <B>not</B> complete, and
 102    * we can't find a category in the cache, we still need to look for it
 103    * in the on-disk index; Therefore when the cache is not complete, we
 104    * need to open a "reader" to the taxonomy index.
 105    * The cache becomes incomplete if it was never filled with the existing
 106    * categories, or if a put() to the cache ever returned true (meaning
 107    * that some of the cached data was cleared).
 108    */
 109   private boolean cacheIsComplete;
 110   private IndexReader reader;
 111   private int cacheMisses;
 112
 113   /**
 114    * When a taxonomy is created, we mark that its create time should be committed in the
 115    * next commit.
 116    */
 117   private String taxoIndexCreateTime = null;
 118
 119   /**
 120    * setDelimiter changes the character that the taxonomy uses in its internal
 121    * storage as a delimiter between category components. Do not use this
 122    * method unless you really know what you are doing. It has nothing to do
 123    * with whatever character the application may be using to represent
 124    * categories for its own use.
 125    * <P>
 126    * If you do use this method, make sure you call it before any other methods
 127    * that actually queries the taxonomy. Moreover, make sure you always pass
 128    * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader
 129    * objects you create for the same directory.
 130    */
 131   public void setDelimiter(char delimiter) {
 132     ensureOpen();
 133     this.delimiter = delimiter;
 134   }
 135
 136   /**
 137    * Forcibly unlocks the taxonomy in the named directory.
 138    * <P>
 139    * Caution: this should only be used by failure recovery code, when it is
 140    * known that no other process nor thread is in fact currently accessing
 141    * this taxonomy.
 142    * <P>
 143    * This method is unnecessary if your {@link Directory} uses a
 144    * {@link NativeFSLockFactory} instead of the default
 145    * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock
 146    * does not stay behind forever when the process using it dies.
 147    */
 148   public static void unlock(Directory directory) throws IOException {
 149     IndexWriter.unlock(directory);
 150   }
 151
 152   /**
 153    * Construct a Taxonomy writer.
 154    *
 155    * @param directory
 156    *    The {@link Directory} in which to store the taxonomy. Note that
 157    *    the taxonomy is written directly to that directory (not to a
 158    *    subdirectory of it).
 159    * @param openMode
 160    *    Specifies how to open a taxonomy for writing: <code>APPEND</code>
 161    *    means open an existing index for append (failing if the index does
 162    *    not yet exist). <code>CREATE</code> means create a new index (first
 163    *    deleting the old one if it already existed).
 164    *    <code>APPEND_OR_CREATE</code> appends to an existing index if there
 165    *    is one, otherwise it creates a new index.
 166    * @param cache
 167    *    A {@link TaxonomyWriterCache} implementation which determines
 168    *    the in-memory caching policy. See for example
 169    *    {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
 170    *    If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
 171    * @throws CorruptIndexException
 172    *     if the taxonomy is corrupted.
 173    * @throws LockObtainFailedException
 174    *     if the taxonomy is locked by another writer. If it is known
 175    *     that no other concurrent writer is active, the lock might
 176    *     have been left around by an old dead process, and should be
 177    *     removed using {@link #unlock(Directory)}.
 178    * @throws IOException
 179    *     if another error occurred.
 180    */
 181   public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode,
 182                               TaxonomyWriterCache cache)
 183   throws CorruptIndexException, LockObtainFailedException,
 184   IOException {
 185
 186     if (!IndexReader.indexExists(directory) || openMode==OpenMode.CREATE) {
 187       taxoIndexCreateTime = Long.toString(System.nanoTime());
 188     }
 189
 190     indexWriter = openIndexWriter(directory, openMode);
 191     reader = null;
 192
 193     parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
 194     parentStreamField.setOmitNorms(true);
 195     fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS);
 196     fullPathField.setIndexOptions(IndexOptions.DOCS_ONLY);
 197
 198     this.nextID = indexWriter.maxDoc();
 199
 200     if (cache==null) {
 201       cache = defaultTaxonomyWriterCache();
 202     }
 203     this.cache = cache;
 204
 205     if (nextID == 0) {
 206       cacheIsComplete = true;
 207       // Make sure that the taxonomy always contain the root category
 208       // with category id 0.
 209       addCategory(new CategoryPath());
 210       refreshReader();
 211     } else {
 212       // There are some categories on the disk, which we have not yet
 213       // read into the cache, and therefore the cache is incomplete.
 214       // We chose not to read all the categories into the cache now,
 215       // to avoid terrible performance when a taxonomy index is opened
 216       // to add just a single category. We will do it later, after we
 217       // notice a few cache misses.
 218       cacheIsComplete = false;
 219     }
 220     cacheMisses = 0;
 221   }
 222
 223   /**
 224    * A hook for extensions of this class to provide their own
 225    * {@link IndexWriter} implementation or instance. Extending classes can
 226    * instantiate and configure the {@link IndexWriter} as they see fit,
 227    * including setting a {@link org.apache.lucene.index.MergeScheduler}, or
 228    * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size
 229    * etc.<br>
 230    * <b>NOTE:</b> the instance this method returns will be closed upon calling
 231    * to {@link #close()}.
 232    *
 233    * @param directory
 234    *          the {@link Directory} on top of which an {@link IndexWriter}
 235    *          should be opened.
 236    * @param openMode
 237    *          see {@link OpenMode}
 238    */
 239   protected IndexWriter openIndexWriter(Directory directory, OpenMode openMode)
 240       throws IOException {
 241     // Make sure we use a MergePolicy which merges segments in-order and thus
 242     // keeps the doc IDs ordered as well (this is crucial for the taxonomy
 243     // index).
 244     IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
 245         new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy(
 246         new LogByteSizeMergePolicy());
 247     return new IndexWriter(directory, config);
 248   }
 249
 250   // Currently overridden by a unit test that verifies that every index we open
 251   // is close()ed.
 252   /**
 253    * Open an {@link IndexReader} from the {@link #indexWriter} member, by
 254    * calling {@link IndexWriter#getReader()}. Extending classes can override
 255    * this method to return their own {@link IndexReader}.
 256    */
 257   protected IndexReader openReader() throws IOException {
 258     return IndexReader.open(indexWriter, true);
 259   }
 260
 261   /**
 262    * Creates a new instance with a default cached as defined by
 263    * {@link #defaultTaxonomyWriterCache()}.
 264    */
 265   public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode)
 266   throws CorruptIndexException, LockObtainFailedException, IOException {
 267     this(directory, openMode, defaultTaxonomyWriterCache());
 268   }
 269
 270   /**
 271    * Defines the default {@link TaxonomyWriterCache} to use in constructors
 272    * which do not specify one.
 273    * <P>
 274    * The current default is {@link Cl2oTaxonomyWriterCache} constructed
 275    * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
 276    * cached in memory while building it.
 277    */
 278   public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
 279     return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
 280   }
 281
 282   // convenience constructors:
 283
 284   public DirectoryTaxonomyWriter(Directory d)
 285   throws CorruptIndexException, LockObtainFailedException,
 286   IOException {
 287     this(d, OpenMode.CREATE_OR_APPEND);
 288   }
 289
 290   /**
 291    * Frees used resources as well as closes the underlying {@link IndexWriter},
 292    * which commits whatever changes made to it to the underlying
 293    * {@link Directory}.
 294    */
 295   public synchronized void close() throws CorruptIndexException, IOException {
 296     if (indexWriter != null) {
 297       if (taxoIndexCreateTime != null) {
 298         indexWriter.commit(combinedCommitData(null));
 299         taxoIndexCreateTime = null;
 300       }
 301       doClose();
 302     }
 303   }
 304
 305   private void doClose() throws CorruptIndexException, IOException {
 306     indexWriter.close();
 307     indexWriter = null;
 308     closeResources();
 309   }
 310
 311   /**
 312    * Returns the number of memory bytes used by the cache.
 313    * @return Number of cache bytes in memory, for CL2O only; zero otherwise.
 314    */
 315   public int getCacheMemoryUsage() {
 316     ensureOpen();
 317     if (this.cache == null || !(this.cache instanceof Cl2oTaxonomyWriterCache)) {
 318       return 0;
 319     }
 320     return ((Cl2oTaxonomyWriterCache)this.cache).getMemoryUsage();
 321   }
 322
 323   /**
 324    * A hook for extending classes to close additional resources that were used.
 325    * The default implementation closes the {@link IndexReader} as well as the
 326    * {@link TaxonomyWriterCache} instances that were used. <br>
 327    * <b>NOTE:</b> if you override this method, you should include a
 328    * <code>super.closeResources()</code> call in your implementation.
 329    */
 330   protected synchronized void closeResources() throws IOException {
 331     if (reader != null) {
 332       reader.close();
 333       reader = null;
 334     }
 335     if (cache != null) {
 336       cache.close();
 337       cache = null;
 338     }
 339   }
 340
 341   /**
 342    * Look up the given category in the cache and/or the on-disk storage,
 343    * returning the category's ordinal, or a negative number in case the
 344    * category does not yet exist in the taxonomy.
 345    */
 346   protected int findCategory(CategoryPath categoryPath) throws IOException {
 347     // If we can find the category in our cache, we can return the
 348     // response directly from it:
 349     int res = cache.get(categoryPath);
 350     if (res >= 0) {
 351       return res;
 352     }
 353     // If we know that the cache is complete, i.e., contains every category
 354     // which exists, we can return -1 immediately. However, if the cache is
 355     // not complete, we need to check the disk.
 356     if (cacheIsComplete) {
 357       return -1;
 358     }
 359     cacheMisses++;
 360     // After a few cache misses, it makes sense to read all the categories
 361     // from disk and into the cache. The reason not to do this on the first
 362     // cache miss (or even when opening the writer) is that it will
 363     // significantly slow down the case when a taxonomy is opened just to
 364     // add one category. The idea only spending a long time on reading
 365     // after enough time was spent on cache misses is known as a "online
 366     // algorithm".
 367     if (perhapsFillCache()) {
 368       return cache.get(categoryPath);
 369     }
 370
 371     // We need to get an answer from the on-disk index. If a reader
 372     // is not yet open, do it now:
 373     if (reader == null) {
 374       reader = openReader();
 375     }
 376
 377     TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
 378         .toString(delimiter)));
 379     if (!docs.next()) {
 380       return -1; // category does not exist in taxonomy
 381     }
 382     // Note: we do NOT add to the cache the fact that the category
 383     // does not exist. The reason is that our only use for this
 384     // method is just before we actually add this category. If
 385     // in the future this usage changes, we should consider caching
 386     // the fact that the category is not in the taxonomy.
 387     addToCache(categoryPath, docs.doc());
 388     return docs.doc();
 389   }
 390
 391   /**
 392    * Look up the given prefix of the given category in the cache and/or the
 393    * on-disk storage, returning that prefix's ordinal, or a negative number in
 394    * case the category does not yet exist in the taxonomy.
 395    */
 396   private int findCategory(CategoryPath categoryPath, int prefixLen)
 397   throws IOException {
 398     int res = cache.get(categoryPath, prefixLen);
 399     if (res >= 0) {
 400       return res;
 401     }
 402     if (cacheIsComplete) {
 403       return -1;
 404     }
 405     cacheMisses++;
 406     if (perhapsFillCache()) {
 407       return cache.get(categoryPath, prefixLen);
 408     }
 409     if (reader == null) {
 410       reader = openReader();
 411     }
 412     TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
 413         .toString(delimiter, prefixLen)));
 414     if (!docs.next()) {
 415       return -1; // category does not exist in taxonomy
 416     }
 417     addToCache(categoryPath, prefixLen, docs.doc());
 418     return docs.doc();
 419   }
 420
 421   // TODO (Facet): addCategory() is synchronized. This means that if indexing is
 422   // multi-threaded, a new category that needs to be written to disk (and
 423   // potentially even trigger a lengthy merge) locks out other addCategory()
 424   // calls - even those which could immediately return a cached value.
 425   // We definitely need to fix this situation!
 426   public synchronized int addCategory(CategoryPath categoryPath) throws IOException {
 427     ensureOpen();
 428     // If the category is already in the cache and/or the taxonomy, we
 429     // should return its existing ordinal:
 430     int res = findCategory(categoryPath);
 431     if (res < 0) {
 432       // This is a new category, and we need to insert it into the index
 433       // (and the cache). Actually, we might also need to add some of
 434       // the category's ancestors before we can add the category itself
 435       // (while keeping the invariant that a parent is always added to
 436       // the taxonomy before its child). internalAddCategory() does all
 437       // this recursively:
 438       res = internalAddCategory(categoryPath, categoryPath.length());
 439     }
 440     return res;
 441
 442   }
 443
 444   /**
 445    * Add a new category into the index (and the cache), and return its new
 446    * ordinal.
 447    * <P>
 448    * Actually, we might also need to add some of the category's ancestors
 449    * before we can add the category itself (while keeping the invariant that a
 450    * parent is always added to the taxonomy before its child). We do this by
 451    * recursion.
 452    */
 453   private int internalAddCategory(CategoryPath categoryPath, int length)
 454   throws CorruptIndexException, IOException {
 455
 456     // Find our parent's ordinal (recursively adding the parent category
 457     // to the taxonomy if it's not already there). Then add the parent
 458     // ordinal as payloads (rather than a stored field; payloads can be
 459     // more efficiently read into memory in bulk by LuceneTaxonomyReader)
 460     int parent;
 461     if (length > 1) {
 462       parent = findCategory(categoryPath, length - 1);
 463       if (parent < 0) {
 464         parent = internalAddCategory(categoryPath, length - 1);
 465       }
 466     } else if (length == 1) {
 467       parent = TaxonomyReader.ROOT_ORDINAL;
 468     } else {
 469       parent = TaxonomyReader.INVALID_ORDINAL;
 470     }
 471     int id = addCategoryDocument(categoryPath, length, parent);
 472
 473     return id;
 474   }
 475
 476   /**
 477    * Verifies that this instance wasn't closed, or throws
 478    * {@link AlreadyClosedException} if it is.
 479    */
 480   protected final void ensureOpen() {
 481     if (indexWriter == null) {
 482       throw new AlreadyClosedException("The taxonomy writer has already been closed");
 483     }
 484   }
 485
 486   // Note that the methods calling addCategoryDocument() are synchornized,
 487   // so this method is effectively synchronized as well, but we'll add
 488   // synchronized to be on the safe side, and we can reuse class-local objects
 489   // instead of allocating them every time
 490   protected synchronized int addCategoryDocument(CategoryPath categoryPath,
 491                                                   int length, int parent)
 492       throws CorruptIndexException, IOException {
 493     // Before Lucene 2.9, position increments >=0 were supported, so we
 494     // added 1 to parent to allow the parent -1 (the parent of the root).
 495     // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
 496     // no longer enough, since 0 is not encoded consistently either (see
 497     // comment in SinglePositionTokenStream). But because we must be
 498     // backward-compatible with existing indexes, we can't just fix what
 499     // we write here (e.g., to write parent+2), and need to do a workaround
 500     // in the reader (which knows that anyway only category 0 has a parent
 501     // -1).
 502     parentStream.set(parent+1);
 503     Document d = new Document();
 504     d.add(parentStreamField);
 505
 506     fullPathField.setValue(categoryPath.toString(delimiter, length));
 507     d.add(fullPathField);
 508
 509     // Note that we do no pass an Analyzer here because the fields that are
 510     // added to the Document are untokenized or contains their own TokenStream.
 511     // Therefore the IndexWriter's Analyzer has no effect.
 512     indexWriter.addDocument(d);
 513     int id = nextID++;
 514
 515     addToCache(categoryPath, length, id);
 516
 517     // also add to the parent array
 518     getParentArray().add(id, parent);
 519
 520     return id;
 521   }
 522
 523   private static class SinglePositionTokenStream extends TokenStream {
 524     private CharTermAttribute termAtt;
 525     private PositionIncrementAttribute posIncrAtt;
 526     private boolean returned;
 527     public SinglePositionTokenStream(String word) {
 528       termAtt = addAttribute(CharTermAttribute.class);
 529       posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 530       termAtt.setEmpty().append(word);
 531       returned = true;
 532     }
 533     /**
 534      * Set the value we want to keep, as the position increment.
 535      * Note that when TermPositions.nextPosition() is later used to
 536      * retrieve this value, val-1 will be returned, not val.
 537      * <P>
 538      * IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
 539      * the retrieved position would be -1). But starting with Lucene 2.9,
 540      * this unfortunately changed, and only val>0 are safe. val=0 can
 541      * still be used, but don't count on the value you retrieve later
 542      * (it could be 0 or -1, depending on circumstances or versions).
 543      * This change is described in Lucene's JIRA: LUCENE-1542.
 544      */
 545     public void set(int val) {
 546       posIncrAtt.setPositionIncrement(val);
 547       returned = false;
 548     }
 549     @Override
 550     public boolean incrementToken() throws IOException {
 551       if (returned) {
 552         return false;
 553       }
 554       returned = true;
 555       return true;
 556     }
 557   }
 558
 559   private void addToCache(CategoryPath categoryPath, int id)
 560   throws CorruptIndexException, IOException {
 561     if (cache.put(categoryPath, id)) {
 562       // If cache.put() returned true, it means the cache was limited in
 563       // size, became full, so parts of it had to be cleared.
 564       // Unfortunately we don't know which part was cleared - it is
 565       // possible that a relatively-new category that hasn't yet been
 566       // committed to disk (and therefore isn't yet visible in our
 567       // "reader") was deleted from the cache, and therefore we must
 568       // now refresh the reader.
 569       // Because this is a slow operation, cache implementations are
 570       // expected not to delete entries one-by-one but rather in bulk
 571       // (LruTaxonomyWriterCache removes the 2/3rd oldest entries).
 572       refreshReader();
 573       cacheIsComplete = false;
 574     }
 575   }
 576
 577   private void addToCache(CategoryPath categoryPath, int prefixLen, int id)
 578   throws CorruptIndexException, IOException {
 579     if (cache.put(categoryPath, prefixLen, id)) {
 580       refreshReader();
 581       cacheIsComplete = false;
 582     }
 583   }
 584
 585   private synchronized void refreshReader() throws IOException {
 586     if (reader != null) {
 587       IndexReader r2 = IndexReader.openIfChanged(reader);
 588       if (r2 != null) {
 589         reader.close();
 590         reader = r2;
 591       }
 592     }
 593   }
 594
 595   /**
 596    * Calling commit() ensures that all the categories written so far are
 597    * visible to a reader that is opened (or reopened) after that call.
 598    * When the index is closed(), commit() is also implicitly done.
 599    * See {@link TaxonomyWriter#commit()}
 600    */
 601   public synchronized void commit() throws CorruptIndexException, IOException {
 602     ensureOpen();
 603     if (taxoIndexCreateTime != null) {
 604       indexWriter.commit(combinedCommitData(null));
 605       taxoIndexCreateTime = null;
 606     } else {
 607       indexWriter.commit();
 608     }
 609     refreshReader();
 610   }
 611
 612   /**
 613    * Combine original user data with that of the taxonomy creation time
 614    */
 615   private Map<String,String> combinedCommitData(Map<String,String> userData) {
 616     Map<String,String> m = new HashMap<String, String>();
 617     if (userData != null) {
 618       m.putAll(userData);
 619     }
 620     m.put(INDEX_CREATE_TIME, taxoIndexCreateTime);
 621     return m;
 622   }
 623
 624   /**
 625    * Like commit(), but also store properties with the index. These properties
 626    * are retrievable by {@link DirectoryTaxonomyReader#getCommitUserData}.
 627    * See {@link TaxonomyWriter#commit(Map)}.
 628    */
 629   public synchronized void commit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
 630     ensureOpen();
 631     if (taxoIndexCreateTime != null) {
 632       indexWriter.commit(combinedCommitData(commitUserData));
 633       taxoIndexCreateTime = null;
 634     } else {
 635       indexWriter.commit(commitUserData);
 636     }
 637     refreshReader();
 638   }
 639
 640   /**
 641    * prepare most of the work needed for a two-phase commit.
 642    * See {@link IndexWriter#prepareCommit}.
 643    */
 644   public synchronized void prepareCommit() throws CorruptIndexException, IOException {
 645     ensureOpen();
 646     if (taxoIndexCreateTime != null) {
 647       indexWriter.prepareCommit(combinedCommitData(null));
 648       taxoIndexCreateTime = null;
 649     } else {
 650       indexWriter.prepareCommit();
 651     }
 652   }
 653
 654   /**
 655    * Like above, and also prepares to store user data with the index.
 656    * See {@link IndexWriter#prepareCommit(Map)}
 657    */
 658   public synchronized void prepareCommit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
 659     ensureOpen();
 660     if (taxoIndexCreateTime != null) {
 661       indexWriter.prepareCommit(combinedCommitData(commitUserData));
 662       taxoIndexCreateTime = null;
 663     } else {
 664       indexWriter.prepareCommit(commitUserData);
 665     }
 666   }
 667
 668   /**
 669    * getSize() returns the number of categories in the taxonomy.
 670    * <P>
 671    * Because categories are numbered consecutively starting with 0, it means
 672    * the taxonomy contains ordinals 0 through getSize()-1.
 673    * <P>
 674    * Note that the number returned by getSize() is often slightly higher than
 675    * the number of categories inserted into the taxonomy; This is because when
 676    * a category is added to the taxonomy, its ancestors are also added
 677    * automatically (including the root, which always get ordinal 0).
 678    */
 679   synchronized public int getSize() {
 680     ensureOpen();
 681     return indexWriter.maxDoc();
 682   }
 683
 684   private boolean alreadyCalledFillCache = false;
 685
 686   /**
 687    * Set the number of cache misses before an attempt is made to read the
 688    * entire taxonomy into the in-memory cache.
 689    * <P>
 690    * LuceneTaxonomyWriter holds an in-memory cache of recently seen
 691    * categories to speed up operation. On each cache-miss, the on-disk index
 692    * needs to be consulted. When an existing taxonomy is opened, a lot of
 693    * slow disk reads like that are needed until the cache is filled, so it
 694    * is more efficient to read the entire taxonomy into memory at once.
 695    * We do this complete read after a certain number (defined by this method)
 696    * of cache misses.
 697    * <P>
 698    * If the number is set to <CODE>0</CODE>, the entire taxonomy is read
 699    * into the cache on first use, without fetching individual categories
 700    * first.
 701    * <P>
 702    * Note that if the memory cache of choice is limited in size, and cannot
 703    * hold the entire content of the on-disk taxonomy, then it is never
 704    * read in its entirety into the cache, regardless of the setting of this
 705    * method.
 706    */
 707   public void setCacheMissesUntilFill(int i) {
 708     ensureOpen();
 709     cacheMissesUntilFill = i;
 710   }
 711
 712   private int cacheMissesUntilFill = 11;
 713
 714   private boolean perhapsFillCache() throws IOException {
 715     // Note: we assume that we're only called when cacheIsComplete==false.
 716     // TODO (Facet): parametrize this criterion:
 717     if (cacheMisses < cacheMissesUntilFill) {
 718       return false;
 719     }
 720     // If the cache was already filled (or we decided not to fill it because
 721     // there was no room), there is no sense in trying it again.
 722     if (alreadyCalledFillCache) {
 723       return false;
 724     }
 725     alreadyCalledFillCache = true;
 726     // TODO (Facet): we should probably completely clear the cache before starting
 727     // to read it?
 728     if (reader == null) {
 729       reader = openReader();
 730     }
 731
 732     if (!cache.hasRoom(reader.numDocs())) {
 733       return false;
 734     }
 735
 736     CategoryPath cp = new CategoryPath();
 737     TermDocs td = reader.termDocs();
 738     Term fullPathTerm = new Term(Consts.FULL);
 739     String field = fullPathTerm.field(); // needed so we can later use !=
 740     TermEnum terms = reader.terms(fullPathTerm);
 741     // The check is done here to avoid checking it on every iteration of the
 742     // below loop. A null term wlil be returned if there are no terms in the
 743     // lexicon, or after the Consts.FULL term. However while the loop is
 744     // executed we're safe, because we only iterate as long as there are next()
 745     // terms.
 746     if (terms.term() != null) {
 747       do {
 748         Term t = terms.term();
 749         if (t.field() != field) break;
 750         // Since we guarantee uniqueness of categories, each term has exactly
 751         // one document. Also, since we do not allow removing categories (and
 752         // hence documents), there are no deletions in the index. Therefore, it
 753         // is sufficient to call next(), and then doc(), exactly once with no
 754         // 'validation' checks.
 755         td.seek(t);
 756         td.next();
 757         cp.clear();
 758         cp.add(t.text(), delimiter);
 759         cache.put(cp, td.doc());
 760       } while (terms.next());
 761     }
 762
 763     cacheIsComplete = true;
 764     // No sense to keep the reader open - we will not need to read from it
 765     // if everything is in the cache.
 766     reader.close();
 767     reader = null;
 768     return true;
 769   }
 770
 771   private ParentArray parentArray;
 772   private synchronized ParentArray getParentArray() throws IOException {
 773     if (parentArray==null) {
 774       if (reader == null) {
 775         reader = openReader();
 776       }
 777       parentArray = new ParentArray();
 778       parentArray.refresh(reader);
 779     }
 780     return parentArray;
 781   }
 782   public int getParent(int ordinal) throws IOException {
 783     ensureOpen();
 784     // Note: the following if() just enforces that a user can never ask
 785     // for the parent of a nonexistant category - even if the parent array
 786     // was allocated bigger than it really needs to be.
 787     if (ordinal >= getSize()) {
 788       throw new ArrayIndexOutOfBoundsException();
 789     }
 790     return getParentArray().getArray()[ordinal];
 791   }
 792
 793   /**
 794    * Take all the categories of one or more given taxonomies, and add them to
 795    * the main taxonomy (this), if they are not already there.
 796    * <P>
 797    * Additionally, fill a <I>mapping</I> for each of the added taxonomies,
 798    * mapping its ordinals to the ordinals in the enlarged main taxonomy.
 799    * These mapping are saved into an array of OrdinalMap objects given by the
 800    * user, one for each of the given taxonomies (not including "this", the main
 801    * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
 802    * others will be a DiskOrdinalMap - see discussion in {OrdinalMap}.
 803    * <P>
 804    * Note that the taxonomies to be added are given as Directory objects,
 805    * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
 806    * currently managed by an open TaxonomyWriter, make sure to commit() (or
 807    * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
 808    * and does not need to be commit()ed before this call.
 809    */
 810   public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
 811     ensureOpen();
 812     // To prevent us stepping on the rest of this class's decisions on when
 813     // to open a reader, and when not, we'll be opening a new reader instead
 814     // of using the existing "reader" object:
 815     IndexReader mainreader = openReader();
 816     TermEnum mainte = mainreader.terms(new Term(Consts.FULL));
 817
 818     IndexReader[] otherreaders = new IndexReader[taxonomies.length];
 819     TermEnum[] othertes = new TermEnum[taxonomies.length];
 820     for (int i=0; i<taxonomies.length; i++) {
 821       otherreaders[i] = IndexReader.open(taxonomies[i]);
 822       othertes[i] = otherreaders[i].terms(new Term(Consts.FULL));
 823       // Also tell the ordinal maps their expected sizes:
 824       ordinalMaps[i].setSize(otherreaders[i].numDocs());
 825     }
 826
 827     CategoryPath cp = new CategoryPath();
 828
 829     // We keep a "current" cursor over the alphabetically-ordered list of
 830     // categories in each taxonomy. We start the cursor on the first
 831     // (alphabetically) category of each taxonomy:
 832
 833     String currentMain;
 834     String[] currentOthers = new String[taxonomies.length];
 835     currentMain = nextTE(mainte);
 836     int otherTaxonomiesLeft = 0;
 837     for (int i=0; i<taxonomies.length; i++) {
 838       currentOthers[i] = nextTE(othertes[i]);
 839       if (currentOthers[i]!=null) {
 840         otherTaxonomiesLeft++;
 841       }
 842     }
 843
 844     // And then, at each step look at the first (alphabetically) of the
 845     // current taxonomies.
 846     // NOTE: The most efficient way we could have done this is using a
 847     // PriorityQueue. But for simplicity, and assuming that usually we'll
 848     // have a very small number of other taxonomies (often just 1), we use
 849     // a more naive algorithm (o(ntaxonomies) instead of o(ln ntaxonomies)
 850     // per step)
 851
 852     while (otherTaxonomiesLeft>0) {
 853       String first=null;
 854       for (int i=0; i<taxonomies.length; i++) {
 855         if (currentOthers[i]==null) continue;
 856         if (first==null || first.compareTo(currentOthers[i])>0) {
 857           first = currentOthers[i];
 858         }
 859       }
 860       int comp = 0;
 861       if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
 862         // If 'first' is before currentMain, or currentMain is null,
 863         // then 'first' is a new category and we need to add it to the
 864         // main taxonomy. Then for all taxonomies with this 'first'
 865         // category, we need to add the new category number to their
 866         // map, and move to the next category in all of them.
 867         cp.clear();
 868         cp.add(first, delimiter);
 869         // We can call internalAddCategory() instead of addCategory()
 870         // because we know the category hasn't been seen yet.
 871         int newordinal = internalAddCategory(cp, cp.length());
 872         // TODO (Facet): we already had this term in our hands before, in nextTE...
 873         Term t = new Term(Consts.FULL, first);
 874         for (int i=0; i<taxonomies.length; i++) {
 875           if (first.equals(currentOthers[i])) {
 876             // remember the remapping of this ordinal. Note how
 877             // this requires reading a posting list from the index -
 878             // but since we do this in lexical order of terms, just
 879             // like Lucene's merge works, we hope there are few seeks.
 880             // TODO (Facet): is there a quicker way? E.g., not specifying the
 881             // next term by name every time?
 882             TermDocs td = otherreaders[i].termDocs(t);
 883             td.next(); // TODO (Facet): check?
 884             int origordinal = td.doc();
 885             ordinalMaps[i].addMapping(origordinal, newordinal);
 886             // and move to the next category in the i'th taxonomy
 887             currentOthers[i] = nextTE(othertes[i]);
 888             if (currentOthers[i]==null) {
 889               otherTaxonomiesLeft--;
 890             }
 891           }
 892         }
 893       } else if (comp==0) {
 894         // 'first' and currentMain are the same, so both the main and some
 895         // other taxonomies need to be moved, but a category doesn't need
 896         // to be added because it already existed in the main taxonomy.
 897
 898         // TODO (Facet): Again, is there a quicker way?
 899         Term t = new Term(Consts.FULL, first);
 900         TermDocs td = mainreader.termDocs(t);
 901         td.next(); // TODO (Facet): check?
 902         int newordinal = td.doc();
 903
 904         currentMain = nextTE(mainte);
 905         for (int i=0; i<taxonomies.length; i++) {
 906           if (first.equals(currentOthers[i])) {
 907             // TODO (Facet): again, is there a quicker way?
 908             td = otherreaders[i].termDocs(t);
 909             td.next(); // TODO (Facet): check?
 910             int origordinal = td.doc();
 911             ordinalMaps[i].addMapping(origordinal, newordinal);
 912
 913             // and move to the next category
 914             currentOthers[i] = nextTE(othertes[i]);
 915             if (currentOthers[i]==null) {
 916               otherTaxonomiesLeft--;
 917             }
 918           }
 919         }
 920       } else /* comp > 0 */ {
 921         // The currentMain doesn't appear in any of the other taxonomies -
 922         // we don't need to do anything, just continue to the next one
 923         currentMain = nextTE(mainte);
 924       }
 925     }
 926
 927     // Close all the readers we've opened, and also tell the ordinal maps
 928     // we're done adding to them
 929     mainreader.close();
 930     for (int i=0; i<taxonomies.length; i++) {
 931       otherreaders[i].close();
 932       // We never actually added a mapping for the root ordinal - let's do
 933       // it now, just so that the map is complete (every ordinal between 0
 934       // and size-1 is remapped)
 935       ordinalMaps[i].addMapping(0, 0);
 936       ordinalMaps[i].addDone();
 937     }
 938   }
 939
 940   /**
 941    * Mapping from old ordinal to new ordinals, used when merging indexes
 942    * wit separate taxonomies.
 943    * <p>
 944    * addToTaxonomies() merges one or more taxonomies into the given taxonomy
 945    * (this). An OrdinalMap is filled for each of the added taxonomies,
 946    * containing the new ordinal (in the merged taxonomy) of each of the
 947    * categories in the old taxonomy.
 948    * <P>
 949    * There exist two implementations of OrdinalMap: MemoryOrdinalMap and
 950    * DiskOrdinalMap. As their names suggest, the former keeps the map in
 951    * memory and the latter in a temporary disk file. Because these maps will
 952    * later be needed one by one (to remap the counting lists), not all at the
 953    * same time, it is recommended to put the first taxonomy's map in memory,
 954    * and all the rest on disk (later to be automatically read into memory one
 955    * by one, when needed).
 956    */
 957   public static interface OrdinalMap {
 958     /**
 959      * Set the size of the map. This MUST be called before addMapping().
 960      * It is assumed (but not verified) that addMapping() will then be
 961      * called exactly 'size' times, with different origOrdinals between 0
 962      * and size-1.
 963      */
 964     public void setSize(int size) throws IOException;
 965     public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
 966     /**
 967      * Call addDone() to say that all addMapping() have been done.
 968      * In some implementations this might free some resources.
 969      */
 970     public void addDone() throws IOException;
 971     /**
 972      * Return the map from the taxonomy's original (consecutive) ordinals
 973      * to the new taxonomy's ordinals. If the map has to be read from disk
 974      * and ordered appropriately, it is done when getMap() is called.
 975      * getMap() should only be called once, and only when the map is actually
 976      * needed. Calling it will also free all resources that the map might
 977      * be holding (such as temporary disk space), other than the returned int[].
 978      */
 979     public int[] getMap() throws IOException;
 980   }
 981
 982   /**
 983    * {@link OrdinalMap} maintained in memory
 984    */
 985   public static final class MemoryOrdinalMap implements OrdinalMap {
 986     int[] map;
 987     public void setSize(int taxonomySize) {
 988       map = new int[taxonomySize];
 989     }
 990     public void addMapping(int origOrdinal, int newOrdinal) {
 991       map[origOrdinal] = newOrdinal;
 992     }
 993     public void addDone() { /* nothing to do */ }
 994     public int[] getMap() {
 995       return map;
 996     }
 997   }
 998
 999   /**
1000    * {@link OrdinalMap} maintained on file system
1001    */
1002   public static final class DiskOrdinalMap implements OrdinalMap {
1003     File tmpfile;
1004     DataOutputStream out;
1005
1006     public DiskOrdinalMap(File tmpfile) throws FileNotFoundException {
1007       this.tmpfile = tmpfile;
1008       out = new DataOutputStream(new BufferedOutputStream(
1009           new FileOutputStream(tmpfile)));
1010     }
1011
1012     public void addMapping(int origOrdinal, int newOrdinal) throws IOException {
1013       out.writeInt(origOrdinal);
1014       out.writeInt(newOrdinal);
1015     }
1016
1017     public void setSize(int taxonomySize) throws IOException {
1018       out.writeInt(taxonomySize);
1019     }
1020
1021     public void addDone() throws IOException {
1022       if (out!=null) {
1023         out.close();
1024         out = null;
1025       }
1026     }
1027
1028     int[] map = null;
1029
1030     public int[] getMap() throws IOException {
1031       if (map!=null) {
1032         return map;
1033       }
1034       addDone(); // in case this wasn't previously called
1035       DataInputStream in = new DataInputStream(new BufferedInputStream(
1036           new FileInputStream(tmpfile)));
1037       map = new int[in.readInt()];
1038       // NOTE: The current code assumes here that the map is complete,
1039       // i.e., every ordinal gets one and exactly one value. Otherwise,
1040       // we may run into an EOF here, or vice versa, not read everything.
1041       for (int i=0; i<map.length; i++) {
1042         int origordinal = in.readInt();
1043         int newordinal = in.readInt();
1044         map[origordinal] = newordinal;
1045       }
1046       in.close();
1047       // Delete the temporary file, which is no longer needed.
1048       if (!tmpfile.delete()) {
1049         tmpfile.deleteOnExit();
1050       }
1051       return map;
1052     }
1053   }
1054
1055   private static final String nextTE(TermEnum te) throws IOException {
1056     if (te.next()) {
1057       Term t = te.term();
1058       // If our enumeration reached a different field, we're done. Note
1059       // how we're allowed compare string references, rather than the
1060       // actual string's contents.
1061       if (t.field()==Consts.FULL) {
1062         return t.text();
1063       }
1064       return null;
1065     }
1066     return null;
1067   }
1068
1069   /**
1070    * Rollback changes to the taxonomy writer and closes the instance. Following
1071    * this method the instance becomes unusable (calling any of its API methods
1072    * will yield an {@link AlreadyClosedException}).
1073    */
1074   public void rollback() throws IOException {
1075     ensureOpen();
1076     indexWriter.rollback();
1077     // since IndexWriter.rollback() closes the IW instance, we should close too.
1078     doClose();
1079   }
1080
1081 }