lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java

   1 package org.apache.lucene.facet.taxonomy.directory;
   2
   3 import java.io.IOException;
   4 import java.util.Iterator;
   5 import java.util.Map;
   6 import java.util.Map.Entry;
   7 import java.util.concurrent.locks.ReadWriteLock;
   8 import java.util.concurrent.locks.ReentrantReadWriteLock;
   9 import java.util.logging.Level;
  10 import java.util.logging.Logger;
  11
  12 import org.apache.lucene.facet.taxonomy.CategoryPath;
  13 import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
  14 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
  15 import org.apache.lucene.index.CorruptIndexException;
  16 import org.apache.lucene.index.IndexReader;
  17 import org.apache.lucene.index.Term;
  18 import org.apache.lucene.index.TermDocs;
  19 import org.apache.lucene.store.AlreadyClosedException;
  20 import org.apache.lucene.store.Directory;
  21 import org.apache.lucene.util.collections.LRUHashMap;
  22
  23 /**
  24  * Licensed to the Apache Software Foundation (ASF) under one or more
  25  * contributor license agreements.  See the NOTICE file distributed with
  26  * this work for additional information regarding copyright ownership.
  27  * The ASF licenses this file to You under the Apache License, Version 2.0
  28  * (the "License"); you may not use this file except in compliance with
  29  * the License.  You may obtain a copy of the License at
  30  *
  31  *     http://www.apache.org/licenses/LICENSE-2.0
  32  *
  33  * Unless required by applicable law or agreed to in writing, software
  34  * distributed under the License is distributed on an "AS IS" BASIS,
  35  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  36  * See the License for the specific language governing permissions and
  37  * limitations under the License.
  38  */
  39
  40 /**
  41  * A {@link TaxonomyReader} which retrieves stored taxonomy information from a
  42  * {@link Directory}.
  43  * <P>
  44  * Reading from the on-disk index on every method call is too slow, so this
  45  * implementation employs caching: Some methods cache recent requests and their
  46  * results, while other methods prefetch all the data into memory and then
  47  * provide answers directly from in-memory tables. See the documentation of
  48  * individual methods for comments on their performance.
  49  *
  50  * @lucene.experimental
  51  */
  52 public class DirectoryTaxonomyReader implements TaxonomyReader {
  53
  54   private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());
  55
  56   private IndexReader indexReader;
  57
  58   // The following lock is used to allow multiple threads to read from the
  59   // index concurrently, while having them block during the very short
  60   // critical moment of refresh() (see comments below). Note, however, that
  61   // we only read from the index when we don't have the entry in our cache,
  62   // and the caches are locked separately.
  63   private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();
  64
  65   // The following are the limited-size LRU caches used to cache the latest
  66   // results from getOrdinal() and getLabel().
  67   // Because LRUHashMap is not thread-safe, we need to synchronize on this
  68   // object when using it. Unfortunately, this is not optimal under heavy
  69   // contention because it means that while one thread is using the cache
  70   // (reading or modifying) others are blocked from using it - or even
  71   // starting to do benign things like calculating the hash function. A more
  72   // efficient approach would be to use a non-locking (as much as possible)
  73   // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
  74   // but with LRU semantics.
  75   // However, even in the current sub-optimal implementation we do not make
  76   // the mistake of locking out readers while waiting for disk in a cache
  77   // miss - below, we do not hold cache lock while reading missing data from
  78   // disk.
  79   private final LRUHashMap<String, Integer> ordinalCache;
  80   private final LRUHashMap<Integer, String> categoryCache;
  81
  82   // getParent() needs to be extremely efficient, to the point that we need
  83   // to fetch all the data in advance into memory, and answer these calls
  84   // from memory. Currently we use a large integer array, which is
  85   // initialized when the taxonomy is opened, and potentially enlarged
  86   // when it is refresh()ed.
  87   // These arrays are not syncrhonized. Rather, the reference to the array
  88   // is volatile, and the only writing operation (refreshPrefetchArrays)
  89   // simply creates a new array and replaces the reference. The volatility
  90   // of the reference ensures the correct atomic replacement and its
  91   // visibility properties (the content of the array is visible when the
  92   // new reference is visible).
  93   private ParentArray parentArray;
  94
  95   private char delimiter = Consts.DEFAULT_DELIMITER;
  96
  97   private volatile boolean closed = false;
  98
  99   /**
 100    * Open for reading a taxonomy stored in a given {@link Directory}.
 101    * @param directory
 102    *    The {@link Directory} in which to the taxonomy lives. Note that
 103    *    the taxonomy is read directly to that directory (not from a
 104    *    subdirectory of it).
 105    * @throws CorruptIndexException if the Taxonomy is corrupted.
 106    * @throws IOException if another error occurred.
 107    */
 108   public DirectoryTaxonomyReader(Directory directory) throws IOException {
 109     this.indexReader = openIndexReader(directory);
 110
 111     // These are the default cache sizes; they can be configured after
 112     // construction with the cache's setMaxSize() method
 113     ordinalCache = new LRUHashMap<String, Integer>(4000);
 114     categoryCache = new LRUHashMap<Integer, String>(4000);
 115
 116     // TODO (Facet): consider lazily create parent array when asked, not in the constructor
 117     parentArray = new ParentArray();
 118     parentArray.refresh(indexReader);
 119   }
 120
 121   protected IndexReader openIndexReader(Directory directory) throws CorruptIndexException, IOException {
 122     return IndexReader.open(directory);
 123   }
 124
 125   /**
 126    * @throws AlreadyClosedException if this IndexReader is closed
 127    */
 128   protected final void ensureOpen() throws AlreadyClosedException {
 129     if (indexReader.getRefCount() <= 0) {
 130       throw new AlreadyClosedException("this TaxonomyReader is closed");
 131     }
 132   }
 133
 134   /**
 135    * setCacheSize controls the maximum allowed size of each of the caches
 136    * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
 137    * <P>
 138    * Currently, if the given size is smaller than the current size of
 139    * a cache, it will not shrink, and rather we be limited to its current
 140    * size.
 141    * @param size the new maximum cache size, in number of entries.
 142    */
 143   public void setCacheSize(int size) {
 144     ensureOpen();
 145     synchronized(categoryCache) {
 146       categoryCache.setMaxSize(size);
 147     }
 148     synchronized(ordinalCache) {
 149       ordinalCache.setMaxSize(size);
 150     }
 151   }
 152
 153   /**
 154    * setDelimiter changes the character that the taxonomy uses in its
 155    * internal storage as a delimiter between category components. Do not
 156    * use this method unless you really know what you are doing.
 157    * <P>
 158    * If you do use this method, make sure you call it before any other
 159    * methods that actually queries the taxonomy. Moreover, make sure you
 160    * always pass the same delimiter for all LuceneTaxonomyWriter and
 161    * LuceneTaxonomyReader objects you create.
 162    */
 163   public void setDelimiter(char delimiter) {
 164     ensureOpen();
 165     this.delimiter = delimiter;
 166   }
 167
 168   public int getOrdinal(CategoryPath categoryPath) throws IOException {
 169     ensureOpen();
 170     if (categoryPath.length()==0) {
 171       return ROOT_ORDINAL;
 172     }
 173     String path = categoryPath.toString(delimiter);
 174
 175     // First try to find the answer in the LRU cache:
 176     synchronized(ordinalCache) {
 177       Integer res = ordinalCache.get(path);
 178       if (res!=null) {
 179         return res.intValue();
 180       }
 181     }
 182
 183     // If we're still here, we have a cache miss. We need to fetch the
 184     // value from disk, and then also put it in the cache:
 185     int ret = TaxonomyReader.INVALID_ORDINAL;
 186     try {
 187       indexReaderLock.readLock().lock();
 188       TermDocs docs = indexReader.termDocs(new Term(Consts.FULL, path));
 189       if (docs.next()) {
 190         ret = docs.doc();
 191       }
 192     } finally {
 193       indexReaderLock.readLock().unlock();
 194     }
 195
 196     // Put the new value in the cache. Note that it is possible that while
 197     // we were doing the above fetching (without the cache locked), some
 198     // other thread already added the same category to the cache. We do
 199     // not care about this possibilty, as LRUCache replaces previous values
 200     // of the same keys (it doesn't store duplicates).
 201     synchronized(ordinalCache) {
 202       // GB: new Integer(int); creates a new object each and every time.
 203       // Integer.valueOf(int) might not (See JavaDoc).
 204       ordinalCache.put(path, Integer.valueOf(ret));
 205     }
 206
 207     return ret;
 208   }
 209
 210   public CategoryPath getPath(int ordinal) throws CorruptIndexException, IOException {
 211     ensureOpen();
 212     // TODO (Facet): Currently, the LRU cache we use (categoryCache) holds
 213     // strings with delimiters, not CategoryPath objects, so even if
 214     // we have a cache hit, we need to process the string and build a new
 215     // CategoryPath object every time. What is preventing us from putting
 216     // the actual CategoryPath object in the cache is the fact that these
 217     // objects are mutable. So we should create an immutable (read-only)
 218     // interface that CategoryPath implements, and this method should
 219     // return this interface, not the writable CategoryPath.
 220     String label = getLabel(ordinal);
 221     if (label==null) {
 222       return null;
 223     }
 224     return new CategoryPath(label, delimiter);
 225   }
 226
 227   public boolean getPath(int ordinal, CategoryPath result) throws CorruptIndexException, IOException {
 228     ensureOpen();
 229     String label = getLabel(ordinal);
 230     if (label==null) {
 231       return false;
 232     }
 233     result.clear();
 234     result.add(label, delimiter);
 235     return true;
 236   }
 237
 238   private String getLabel(int catID) throws CorruptIndexException, IOException {
 239     ensureOpen();
 240     // First try to find the answer in the LRU cache. It is very
 241     // unfortunate that we need to allocate an Integer object here -
 242     // it would have been better if we used a hash table specifically
 243     // designed for int keys...
 244     // GB: new Integer(int); creates a new object each and every time.
 245     // Integer.valueOf(int) might not (See JavaDoc).
 246     Integer catIDInteger = Integer.valueOf(catID);
 247
 248     synchronized(categoryCache) {
 249       String res = categoryCache.get(catIDInteger);
 250       if (res!=null) {
 251         return res;
 252       }
 253     }
 254
 255     // If we're still here, we have a cache miss. We need to fetch the
 256     // value from disk, and then also put it in the cache:
 257     String ret;
 258     try {
 259       indexReaderLock.readLock().lock();
 260       // The taxonomy API dictates that if we get an invalid category
 261       // ID, we should return null, If we don't check this here, we
 262       // can some sort of an exception from the document() call below.
 263       // NOTE: Currently, we *do not* cache this return value; There
 264       // isn't much point to do so, because checking the validity of
 265       // the docid doesn't require disk access - just comparing with
 266       // the number indexReader.maxDoc().
 267       if (catID<0 || catID>=indexReader.maxDoc()) {
 268         return null;
 269       }
 270       ret = indexReader.document(catID, Consts.fullPathSelector)
 271       .get(Consts.FULL);
 272     } finally {
 273       indexReaderLock.readLock().unlock();
 274     }
 275     // Put the new value in the cache. Note that it is possible that while
 276     // we were doing the above fetching (without the cache locked), some
 277     // other thread already added the same category to the cache. We do
 278     // not care about this possibility, as LRUCache replaces previous
 279     // values of the same keys (it doesn't store duplicates).
 280     synchronized (categoryCache) {
 281       categoryCache.put(catIDInteger, ret);
 282     }
 283
 284     return ret;
 285   }
 286
 287   public int getParent(int ordinal) {
 288     ensureOpen();
 289     // Note how we don't need to hold the read lock to do the following,
 290     // because the array reference is volatile, ensuring the correct
 291     // visibility and ordering: if we get the new reference, the new
 292     // data is also visible to this thread.
 293     return getParentArray()[ordinal];
 294   }
 295
 296   /**
 297    * getParentArray() returns an int array of size getSize() listing the
 298    * ordinal of the parent category of each category in the taxonomy.
 299    * <P>
 300    * The caller can hold on to the array it got indefinitely - it is
 301    * guaranteed that no-one else will modify it. The other side of the
 302    * same coin is that the caller must treat the array it got as read-only
 303    * and <B>not modify it</B>, because other callers might have gotten the
 304    * same array too, and getParent() calls are also answered from the
 305    * same array.
 306    * <P>
 307    * The getParentArray() call is extremely efficient, merely returning
 308    * a reference to an array that already exists. For a caller that plans
 309    * to call getParent() for many categories, using getParentArray() and
 310    * the array it returns is a somewhat faster approach because it avoids
 311    * the overhead of method calls and volatile dereferencing.
 312    * <P>
 313    * If you use getParentArray() instead of getParent(), remember that
 314    * the array you got is (naturally) not modified after a refresh(),
 315    * so you should always call getParentArray() again after a refresh().
 316    */
 317
 318   public int[] getParentArray() {
 319     ensureOpen();
 320     // Note how we don't need to hold the read lock to do the following,
 321     // because the array reference is volatile, ensuring the correct
 322     // visibility and ordering: if we get the new reference, the new
 323     // data is also visible to this thread.
 324     return parentArray.getArray();
 325   }
 326
 327   // Note that refresh() is synchronized (it is the only synchronized
 328   // method in this class) to ensure that it never gets called concurrently
 329   // with itself.
 330   public synchronized boolean refresh() throws IOException, InconsistentTaxonomyException {
 331     ensureOpen();
 332     /*
 333      * Since refresh() can be a lengthy operation, it is very important that we
 334      * avoid locking out all readers for its duration. This is why we don't hold
 335      * the indexReaderLock write lock for the entire duration of this method. In
 336      * fact, it is enough to hold it only during a single assignment! Other
 337      * comments in this method will explain this.
 338      */
 339
 340     // note that the lengthy operation indexReader.reopen() does not
 341     // modify the reader, so we can do it without holding a lock. We can
 342     // safely read indexReader without holding the write lock, because
 343     // no other thread can be writing at this time (this method is the
 344     // only possible writer, and it is "synchronized" to avoid this case).
 345     IndexReader r2 = IndexReader.openIfChanged(indexReader);
 346     if (r2 == null) {
 347         return false; // no changes, nothing to do
 348     }
 349
 350     // validate that a refresh is valid at this point, i.e. that the taxonomy
 351     // was not recreated since this reader was last opened or refresshed.
 352     String t1 = indexReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
 353     String t2 = r2.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
 354     if (t1==null) {
 355         if (t2!=null) {
 356                 r2.close();
 357                 throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2);
 358         }
 359     } else if (!t1.equals(t2)) {
 360         r2.close();
 361         throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2+"  !=  "+t1);
 362     }
 363
 364       IndexReader oldreader = indexReader;
 365       // we can close the old searcher, but need to synchronize this
 366       // so that we don't close it in the middle that another routine
 367       // is reading from it.
 368       indexReaderLock.writeLock().lock();
 369       indexReader = r2;
 370       indexReaderLock.writeLock().unlock();
 371       // We can close the old reader, but need to be certain that we
 372       // don't close it while another method is reading from it.
 373       // Luckily, we can be certain of that even without putting the
 374       // oldreader.close() in the locked section. The reason is that
 375       // after lock() succeeded above, we know that all existing readers
 376       // had finished (this is what a read-write lock ensures). New
 377       // readers, starting after the unlock() we just did, already got
 378       // the new indexReader we set above. So nobody can be possibly
 379       // using the old indexReader, and we can close it:
 380       oldreader.close();
 381
 382       // We prefetch some of the arrays to make requests much faster.
 383       // Let's refresh these prefetched arrays; This refresh is much
 384       // is made more efficient by assuming that it is enough to read
 385       // the values for new categories (old categories could not have been
 386       // changed or deleted)
 387       // Note that this this done without the write lock being held,
 388       // which means that it is possible that during a refresh(), a
 389       // reader will have some methods (like getOrdinal and getCategory)
 390       // return fresh information, while getParent()
 391       // (only to be prefetched now) still return older information.
 392       // We consider this to be acceptable. The important thing,
 393       // however, is that refreshPrefetchArrays() itself writes to
 394       // the arrays in a correct manner (see discussion there)
 395       parentArray.refresh(indexReader);
 396
 397       // Remove any INVALID_ORDINAL values from the ordinal cache,
 398       // because it is possible those are now answered by the new data!
 399       Iterator<Entry<String, Integer>> i = ordinalCache.entrySet().iterator();
 400       while (i.hasNext()) {
 401         Entry<String, Integer> e = i.next();
 402         if (e.getValue().intValue() == INVALID_ORDINAL) {
 403           i.remove();
 404         }
 405       }
 406       return true;
 407     }
 408
 409   public void close() throws IOException {
 410     if (!closed) {
 411       decRef();
 412       closed = true;
 413     }
 414   }
 415
 416   /** Do the actual closing, free up resources */
 417   private void doClose() throws IOException {
 418     indexReader.close();
 419     closed = true;
 420
 421     parentArray = null;
 422     childrenArrays = null;
 423     categoryCache.clear();
 424     ordinalCache.clear();
 425   }
 426
 427   public int getSize() {
 428     ensureOpen();
 429     indexReaderLock.readLock().lock();
 430     try {
 431       return indexReader.numDocs();
 432     } finally {
 433       indexReaderLock.readLock().unlock();
 434     }
 435   }
 436
 437   public Map<String, String> getCommitUserData() {
 438     ensureOpen();
 439     return indexReader.getCommitUserData();
 440   }
 441
 442   private ChildrenArrays childrenArrays;
 443   Object childrenArraysRebuild = new Object();
 444
 445   public ChildrenArrays getChildrenArrays() {
 446     ensureOpen();
 447     // Check if the taxonomy grew since we built the array, and if it
 448     // did, create new (and larger) arrays and fill them as required.
 449     // We do all this under a lock, two prevent to concurrent calls to
 450     // needlessly do the same array building at the same time.
 451     synchronized(childrenArraysRebuild) {
 452       int num = getSize();
 453       int first;
 454       if (childrenArrays==null) {
 455         first = 0;
 456       } else {
 457         first = childrenArrays.getYoungestChildArray().length;
 458       }
 459       // If the taxonomy hasn't grown, we can return the existing object
 460       // immediately
 461       if (first == num) {
 462         return childrenArrays;
 463       }
 464       // Otherwise, build new arrays for a new ChildrenArray object.
 465       // These arrays start with an enlarged copy of the previous arrays,
 466       // and then are modified to take into account the new categories:
 467       int[] newYoungestChildArray = new int[num];
 468       int[] newOlderSiblingArray = new int[num];
 469       // In Java 6, we could just do Arrays.copyOf()...
 470       if (childrenArrays!=null) {
 471         System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
 472             newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
 473         System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
 474             newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
 475       }
 476       int[] parents = getParentArray();
 477       for (int i=first; i<num; i++) {
 478         newYoungestChildArray[i] = INVALID_ORDINAL;
 479       }
 480       // In the loop below we can ignore the root category (0) because
 481       // it has no parent
 482       if (first==0) {
 483         first = 1;
 484         newOlderSiblingArray[0] = INVALID_ORDINAL;
 485       }
 486       for (int i=first; i<num; i++) {
 487         // Note that parents[i] is always < i, so the right-hand-side of
 488         // the following line is already set when we get here.
 489         newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
 490         newYoungestChildArray[parents[i]] = i;
 491       }
 492       // Finally switch to the new arrays
 493       childrenArrays = new ChildrenArraysImpl(newYoungestChildArray,
 494           newOlderSiblingArray);
 495       return childrenArrays;
 496     }
 497   }
 498
 499   public String toString(int max) {
 500     ensureOpen();
 501     StringBuilder sb = new StringBuilder();
 502     int upperl = Math.min(max, this.indexReader.maxDoc());
 503     for (int i = 0; i < upperl; i++) {
 504       try {
 505         CategoryPath category = this.getPath(i);
 506         if (category == null) {
 507           sb.append(i + ": NULL!! \n");
 508           continue;
 509         }
 510         if (category.length() == 0) {
 511           sb.append(i + ": EMPTY STRING!! \n");
 512           continue;
 513         }
 514         sb.append(i +": "+category.toString()+"\n");
 515       } catch (IOException e) {
 516         if (logger.isLoggable(Level.FINEST)) {
 517           logger.log(Level.FINEST, e.getMessage(), e);
 518         }
 519       }
 520     }
 521     return sb.toString();
 522   }
 523
 524   private static final class ChildrenArraysImpl implements ChildrenArrays {
 525     private int[] youngestChildArray, olderSiblingArray;
 526     public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
 527       this.youngestChildArray = youngestChildArray;
 528       this.olderSiblingArray = olderSiblingArray;
 529     }
 530     public int[] getOlderSiblingArray() {
 531       return olderSiblingArray;
 532     }
 533     public int[] getYoungestChildArray() {
 534       return youngestChildArray;
 535     }
 536   }
 537
 538   /**
 539    * Expert:  This method is only for expert use.
 540    * Note also that any call to refresh() will invalidate the returned reader,
 541    * so the caller needs to take care of appropriate locking.
 542    *
 543    * @return lucene indexReader
 544    */
 545   IndexReader getInternalIndexReader() {
 546     ensureOpen();
 547     return this.indexReader;
 548   }
 549
 550   /**
 551    * Expert: decreases the refCount of this TaxonomyReader instance.
 552    * If the refCount drops to 0, then pending changes (if any) are
 553    * committed to the taxonomy index and this reader is closed.
 554    * @throws IOException
 555    */
 556   public void decRef() throws IOException {
 557     ensureOpen();
 558     if (indexReader.getRefCount() == 1) {
 559       // Do not decRef the indexReader - doClose does it by calling reader.close()
 560       doClose();
 561     } else {
 562       indexReader.decRef();
 563     }
 564   }
 565
 566   /**
 567    * Expert: returns the current refCount for this taxonomy reader
 568    */
 569   public int getRefCount() {
 570     ensureOpen();
 571     return this.indexReader.getRefCount();
 572   }
 573
 574   /**
 575    * Expert: increments the refCount of this TaxonomyReader instance.
 576    * RefCounts are used to determine when a taxonomy reader can be closed
 577    * safely, i.e. as soon as there are no more references.
 578    * Be sure to always call a corresponding decRef(), in a finally clause;
 579    * otherwise the reader may never be closed.
 580    */
 581   public void incRef() {
 582     ensureOpen();
 583     this.indexReader.incRef();
 584   }
 585 }