1 package org.apache.lucene.facet.taxonomy.lucene;
4 import java.io.IOException;
5 import java.util.Iterator;
7 import java.util.Map.Entry;
8 import java.util.concurrent.locks.ReadWriteLock;
9 import java.util.concurrent.locks.ReentrantReadWriteLock;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
13 import org.apache.lucene.index.CorruptIndexException;
14 import org.apache.lucene.index.IndexReader;
15 import org.apache.lucene.index.Term;
16 import org.apache.lucene.index.TermDocs;
17 import org.apache.lucene.store.Directory;
18 import org.apache.lucene.store.FSDirectory;
20 import org.apache.lucene.facet.taxonomy.CategoryPath;
21 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
22 import org.apache.lucene.util.collections.LRUHashMap;
25 * Licensed to the Apache Software Foundation (ASF) under one or more
26 * contributor license agreements. See the NOTICE file distributed with
27 * this work for additional information regarding copyright ownership.
28 * The ASF licenses this file to You under the Apache License, Version 2.0
29 * (the "License"); you may not use this file except in compliance with
30 * the License. You may obtain a copy of the License at
32 * http://www.apache.org/licenses/LICENSE-2.0
34 * Unless required by applicable law or agreed to in writing, software
35 * distributed under the License is distributed on an "AS IS" BASIS,
36 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 * See the License for the specific language governing permissions and
38 * limitations under the License.
42 * LuceneTaxonomyReader is a {@link TaxonomyReader} which retrieves stored
43 * taxonomy information from a separate Lucene index. By using a Lucene index,
44 * rather than some specialized file format, we get for "free" its correctness
45 * (especially regarding concurrency), and the ability to save it on any
46 * implementation of Directory (and not just the file system).
48 * Reading from the on-disk index on every method call is too slow, so this
49 * implementation employs caching: Some methods cache recent requests and
50 * their results, while other methods prefetch all the data into memory
51 * and then provide answers directly from in-memory tables. See the
52 * documentation of individual methods for comments on their performance.
54 * @lucene.experimental
56 public class LuceneTaxonomyReader implements TaxonomyReader {
// Logger for this class; only consulted at FINEST level (see toString(int)).
58 private static final Logger logger = Logger.getLogger(LuceneTaxonomyReader.class.getName());
// The underlying Lucene index holding the taxonomy. Swapped for a reopened
// reader by refresh(); all swaps happen under indexReaderLock's write lock.
60 private IndexReader indexReader;
62 // The following lock is used to allow multiple threads to read from the
63 // index concurrently, while having them block during the very short
64 // critical moment of refresh() (see comments below). Note, however, that
65 // we only read from the index when we don't have the entry in our cache,
66 // and the caches are locked separately.
67 private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();
69 // The following are the limited-size LRU caches used to cache the latest
70 // results from getOrdinal() and getCategoryCache().
71 // Because LRUHashMap is not thread-safe, we need to synchronize on this
72 // object when using it. Unfortunately, this is not optimal under heavy
73 // contention because it means that while one thread is using the cache
74 // (reading or modifying) others are blocked from using it - or even
75 // starting to do benign things like calculating the hash function. A more
76 // efficient approach would be to use a non-locking (as much as possible)
77 // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
78 // but with LRU semantics.
79 // However, even in the current sub-optimal implementation we do not make
80 // the mistake of locking out readers while waiting for disk in a cache
81 // miss - below, we do not hold cache lock while reading missing data from disk.
83 private final LRUHashMap<String, Integer> getOrdinalCache;
84 private final LRUHashMap<Integer, String> getCategoryCache;
86 // getParent() needs to be extremely efficient, to the point that we need
87 // to fetch all the data in advance into memory, and answer these calls
88 // from memory. Currently we use a large integer array, which is
89 // initialized when the taxonomy is opened, and potentially enlarged
90 // when it is refresh()ed.
91 // These arrays are not synchronized. Rather, the reference to the array
92 // is volatile, and the only writing operation (refreshPrefetchArrays)
93 // simply creates a new array and replaces the reference. The volatility
94 // of the reference ensures the correct atomic replacement and its
95 // visibility properties (the content of the array is visible when the
96 // new reference is visible).
97 private ParentArray parentArray;
// Character used between path components in the index's stored category
// strings; must match the one used by the writer (see setDelimiter()).
99 private char delimiter = Consts.DEFAULT_DELIMITER;
102 * Open for reading a taxonomy stored in a given {@link Directory}.
* @param directory
104 * The {@link Directory} in which the taxonomy lives. Note that
105 * the taxonomy is read directly to that directory (not from a
106 * subdirectory of it).
107 * @throws CorruptIndexException if the Taxonomy is corrupted.
108 * @throws IOException if another error occurred.
110 public LuceneTaxonomyReader(Directory directory)
111 throws CorruptIndexException, IOException {
112 this.indexReader = openIndexReader(directory);
114 // These are the default cache sizes; they can be configured after
115 // construction with the cache's setMaxSize() method
116 getOrdinalCache = new LRUHashMap<String, Integer>(4000);
117 getCategoryCache = new LRUHashMap<Integer, String>(4000);
119 // TODO (Facet): consider lazily creating the parent array when asked, not in the constructor
// Eagerly load the ordinal->parent mapping so getParent() is answered
// purely from memory afterwards.
120 parentArray = new ParentArray();
121 parentArray.refresh(indexReader);
// Factory hook: subclasses may override to control how the underlying
// taxonomy index is opened (e.g. with a custom deletion policy).
124 protected IndexReader openIndexReader(Directory directory) throws CorruptIndexException, IOException {
125 return IndexReader.open(directory);
128 // convenience constructors... deprecated because they cause confusion
129 // since they use the parent directory instead of the actual directory.
130 private Directory ourDirectory = null; // remember directory to close later, but only if we opened it here
132 * Open for reading a taxonomy stored in a subdirectory of a given
133 * directory on the file system.
134 * @param parentDir The parent directory of the taxonomy's directory
135 * (usually this would be the directory holding the index).
136 * @param name The name of the taxonomy, and the subdirectory holding it.
137 * @throws CorruptIndexException if the Taxonomy is corrupted.
138 * @throws IOException if another error occurred.
* @deprecated use {@link #LuceneTaxonomyReader(Directory)} instead.
// NOTE(review): the @Deprecated annotation appears elided in this extract
// (the comment above the ourDirectory field calls these constructors
// deprecated) — confirm against the original source.
141 public LuceneTaxonomyReader(File parentDir, String name)
142 throws CorruptIndexException, IOException {
143 this(FSDirectory.open(new File(parentDir, name)));
144 ourDirectory = indexReader.directory(); // remember to close the directory we opened
148 * Open for reading a taxonomy stored in a subdirectory of a given
149 * directory on the file system.
150 * @param parentDir The parent directory of the taxonomy's directory.
151 * @param name The name of the taxonomy, and the subdirectory holding it.
152 * @throws CorruptIndexException if the Taxonomy is corrupted.
153 * @throws IOException if another error occurred.
* @deprecated use {@link #LuceneTaxonomyReader(Directory)} instead.
// NOTE(review): the @Deprecated annotation appears elided in this extract
// (the comment above the ourDirectory field calls these constructors
// deprecated) — confirm against the original source.
156 public LuceneTaxonomyReader(String parentDir, String name)
157 throws CorruptIndexException, IOException {
158 this(FSDirectory.open(new File(parentDir, name)));
159 ourDirectory = indexReader.directory(); // remember to close the directory we opened
163 * setCacheSize controls the maximum allowed size of each of the caches
164 * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
166 * Currently, if the given size is smaller than the current size of
167 * a cache, it will not shrink, and rather will be limited to its current
* size.
169 * @param size the new maximum cache size, in number of entries.
171 public void setCacheSize(int size) {
// Each cache has its own monitor; take them one at a time, never nested.
172 synchronized(getCategoryCache) {
173 getCategoryCache.setMaxSize(size);
175 synchronized(getOrdinalCache) {
176 getOrdinalCache.setMaxSize(size);
181 * setDelimiter changes the character that the taxonomy uses in its
182 * internal storage as a delimiter between category components. Do not
183 * use this method unless you really know what you are doing.
185 * If you do use this method, make sure you call it before any other
186 * methods that actually query the taxonomy. Moreover, make sure you
187 * always pass the same delimiter for all LuceneTaxonomyWriter and
188 * LuceneTaxonomyReader objects you create.
190 public void setDelimiter(char delimiter) {
191 this.delimiter = delimiter;
// Returns the ordinal of the given category, or INVALID_ORDINAL if it is
// not present in the taxonomy. Serves from the LRU cache when possible;
// on a miss, reads the index under the read lock and caches the result.
194 public int getOrdinal(CategoryPath categoryPath) throws IOException {
195 if (categoryPath.length()==0) {
// NOTE(review): branch body elided in this extract — presumably returns
// ROOT_ORDINAL for the empty (root) path; confirm against the original.
198 String path = categoryPath.toString(delimiter);
200 // First try to find the answer in the LRU cache:
201 synchronized(getOrdinalCache) {
202 Integer res = getOrdinalCache.get(path);
// NOTE(review): the "res != null" guard for this dereference is elided here.
204 return res.intValue();
208 // If we're still here, we have a cache miss. We need to fetch the
209 // value from disk, and then also put it in the cache:
210 int ret = TaxonomyReader.INVALID_ORDINAL;
// NOTE(review): a try { ... } finally { unlock } wrapper appears elided
// here; the unlock below must run even if termDocs() throws.
212 indexReaderLock.readLock().lock();
213 TermDocs docs = indexReader.termDocs(new Term(Consts.FULL, path));
// NOTE(review): elided — presumably "if (docs.next()) ret = docs.doc();".
218 indexReaderLock.readLock().unlock();
221 // Put the new value in the cache. Note that it is possible that while
222 // we were doing the above fetching (without the cache locked), some
223 // other thread already added the same category to the cache. We do
224 // not care about this possibility, as LRUCache replaces previous values
225 // of the same keys (it doesn't store duplicates).
226 synchronized(getOrdinalCache) {
227 // GB: new Integer(int); creates a new object each and every time.
228 // Integer.valueOf(int) might not (See JavaDoc).
229 getOrdinalCache.put(path, Integer.valueOf(ret));
// Builds and returns a fresh CategoryPath for the given ordinal by parsing
// the cached/stored label string with the current delimiter.
235 public CategoryPath getPath(int ordinal) throws CorruptIndexException, IOException {
236 // TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
237 // strings with delimiters, not CategoryPath objects, so even if
238 // we have a cache hit, we need to process the string and build a new
239 // CategoryPath object every time. What is preventing us from putting
240 // the actual CategoryPath object in the cache is the fact that these
241 // objects are mutable. So we should create an immutable (read-only)
242 // interface that CategoryPath implements, and this method should
243 // return this interface, not the writable CategoryPath.
244 String label = getLabel(ordinal);
// NOTE(review): elided — presumably "if (label==null) return null;" for
// ordinals outside the taxonomy; confirm against the original source.
248 return new CategoryPath(label, delimiter);
// Appends the path of the given ordinal into the caller-supplied (reusable)
// CategoryPath; returns whether the ordinal was found.
251 public boolean getPath(int ordinal, CategoryPath result) throws CorruptIndexException, IOException {
252 String label = getLabel(ordinal);
// NOTE(review): elided — presumably "if (label==null) return false;" before
// the add, and "return true;" after it; confirm against the original.
257 result.add(label, delimiter);
// Returns the stored label (delimited full-path string) of the given
// category ordinal, or null when the ordinal is out of range. LRU cache
// first; on a miss, reads the stored document under the read lock.
261 private String getLabel(int catID) throws CorruptIndexException, IOException {
262 // First try to find the answer in the LRU cache. It is very
263 // unfortunate that we need to allocate an Integer object here -
264 // it would have been better if we used a hash table specifically
265 // designed for int keys...
266 // GB: new Integer(int); creates a new object each and every time.
267 // Integer.valueOf(int) might not (See JavaDoc).
268 Integer catIDInteger = Integer.valueOf(catID);
270 synchronized(getCategoryCache) {
271 String res = getCategoryCache.get(catIDInteger);
// NOTE(review): the "res != null" guard and cache-hit return are elided here.
277 // If we're still here, we have a cache miss. We need to fetch the
278 // value from disk, and then also put it in the cache:
// NOTE(review): the declaration of "ret" and a try { ... } finally { unlock }
// wrapper appear elided in this extract; confirm against the original.
281 indexReaderLock.readLock().lock();
282 // The taxonomy API dictates that if we get an invalid category
283 // ID, we should return null. If we don't check this here, we
284 // can get some sort of an exception from the document() call below.
285 // NOTE: Currently, we *do not* cache this return value; There
286 // isn't much point to do so, because checking the validity of
287 // the docid doesn't require disk access - just comparing with
288 // the number indexReader.maxDoc().
289 if (catID<0 || catID>=indexReader.maxDoc()) {
// NOTE(review): elided — presumably returns null (after unlocking).
292 ret = indexReader.document(catID, Consts.fullPathSelector)
// NOTE(review): elided — presumably the ".get(Consts.FULL);" continuation
// of this call expression.
295 indexReaderLock.readLock().unlock();
297 // Put the new value in the cache. Note that it is possible that while
298 // we were doing the above fetching (without the cache locked), some
299 // other thread already added the same category to the cache. We do
300 // not care about this possibility, as LRUCache replaces previous
301 // values of the same keys (it doesn't store duplicates).
302 synchronized (getCategoryCache) {
303 getCategoryCache.put(catIDInteger, ret);
// O(1) in-memory lookup of a category's parent ordinal; never touches disk
// because the parent array is prefetched (see parentArray field comments).
309 public int getParent(int ordinal) {
310 // Note how we don't need to hold the read lock to do the following,
311 // because the array reference is volatile, ensuring the correct
312 // visibility and ordering: if we get the new reference, the new
313 // data is also visible to this thread.
314 return getParentArray()[ordinal];
318 * getParentArray() returns an int array of size getSize() listing the
319 * ordinal of the parent category of each category in the taxonomy.
321 * The caller can hold on to the array it got indefinitely - it is
322 * guaranteed that no-one else will modify it. The other side of the
323 * same coin is that the caller must treat the array it got as read-only
324 * and <B>not modify it</B>, because other callers might have gotten the
325 * same array too, and getParent() calls are also answered from the
* same array.
328 * The getParentArray() call is extremely efficient, merely returning
329 * a reference to an array that already exists. For a caller that plans
330 * to call getParent() for many categories, using getParentArray() and
331 * the array it returns is a somewhat faster approach because it avoids
332 * the overhead of method calls and volatile dereferencing.
334 * If you use getParentArray() instead of getParent(), remember that
335 * the array you got is (naturally) not modified after a refresh(),
336 * so you should always call getParentArray() again after a refresh().
339 public int[] getParentArray() {
340 // Note how we don't need to hold the read lock to do the following,
341 // because the array reference is volatile, ensuring the correct
342 // visibility and ordering: if we get the new reference, the new
343 // data is also visible to this thread.
344 return parentArray.getArray();
347 // Note that refresh() is synchronized (it is the only synchronized
348 // method in this class) to ensure that it never gets called concurrently
// with itself (i.e. there is at most one writer at any moment).
350 public synchronized void refresh() throws IOException {
352 * Since refresh() can be a lengthy operation, it is very important that we
353 * avoid locking out all readers for its duration. This is why we don't hold
354 * the indexReaderLock write lock for the entire duration of this method. In
355 * fact, it is enough to hold it only during a single assignment! Other
356 * comments in this method will explain this.
359 // note that the lengthy operation indexReader.reopen() does not
360 // modify the reader, so we can do it without holding a lock. We can
361 // safely read indexReader without holding the write lock, because
362 // no other thread can be writing at this time (this method is the
363 // only possible writer, and it is "synchronized" to avoid this case).
364 IndexReader r2 = indexReader.reopen();
365 if (indexReader != r2) {
366 IndexReader oldreader = indexReader;
367 // we can close the old searcher, but need to synchronize this
368 // so that we don't close it in the middle that another routine
369 // is reading from it.
370 indexReaderLock.writeLock().lock();
// NOTE(review): the critical assignment "indexReader = r2;" — the single
// write this lock exists to protect — appears elided in this extract.
372 indexReaderLock.writeLock().unlock();
373 // We can close the old reader, but need to be certain that we
374 // don't close it while another method is reading from it.
375 // Luckily, we can be certain of that even without putting the
376 // oldreader.close() in the locked section. The reason is that
377 // after lock() succeeded above, we know that all existing readers
378 // had finished (this is what a read-write lock ensures). New
379 // readers, starting after the unlock() we just did, already got
380 // the new indexReader we set above. So nobody can be possibly
381 // using the old indexReader, and we can close it:
// NOTE(review): elided — presumably "oldreader.close();".
384 // We prefetch some of the arrays to make requests much faster.
385 // Let's refresh these prefetched arrays; This refresh
386 // is made more efficient by assuming that it is enough to read
387 // the values for new categories (old categories could not have been
388 // changed or deleted)
389 // Note that this is done without the write lock being held,
390 // which means that it is possible that during a refresh(), a
391 // reader will have some methods (like getOrdinal and getCategory)
392 // return fresh information, while getParent()
393 // (only to be prefetched now) still return older information.
394 // We consider this to be acceptable. The important thing,
395 // however, is that refreshPrefetchArrays() itself writes to
396 // the arrays in a correct manner (see discussion there)
397 parentArray.refresh(indexReader);
399 // Remove any INVALID_ORDINAL values from the ordinal cache,
400 // because it is possible those are now answered by the new data!
// NOTE(review): this iteration is not under the getOrdinalCache monitor in
// the visible lines, although LRUHashMap is documented above as not
// thread-safe — confirm synchronization against the original source.
401 Iterator<Entry<String, Integer>> i = getOrdinalCache.entrySet().iterator();
402 while (i.hasNext()) {
403 Entry<String, Integer> e = i.next();
404 if (e.getValue().intValue() == INVALID_ORDINAL) {
// NOTE(review): elided — presumably "i.remove();".
// Closes the underlying index reader, and also the Directory if (and only
// if) this instance opened it itself via a File/String constructor.
411 public void close() throws IOException {
// NOTE(review): elided — presumably "indexReader.close();" precedes this.
413 if (ourDirectory!=null) {
414 ourDirectory.close();
// Returns the number of categories in the taxonomy (one document per
// category), reading numDocs() under the read lock so it cannot race
// with a refresh() swap of indexReader.
418 public int getSize() {
419 indexReaderLock.readLock().lock();
// NOTE(review): a try { ... } finally { unlock } wrapper appears elided
// here; as written the unlock below would be unreachable after the return.
421 return indexReader.numDocs();
423 indexReaderLock.readLock().unlock();
// Exposes the commit user-data map stored by the taxonomy writer in the
// underlying index's latest commit.
427 public Map<String, String> getCommitUserData() {
428 return indexReader.getCommitUserData();
// Lazily-built youngest-child / older-sibling arrays; rebuilt (enlarged)
// on demand when the taxonomy has grown. Guarded by childrenArraysRebuild.
431 private ChildrenArrays childrenArrays;
432 Object childrenArraysRebuild = new Object();
434 public ChildrenArrays getChildrenArrays() {
435 // Check if the taxonomy grew since we built the array, and if it
436 // did, create new (and larger) arrays and fill them as required.
437 // We do all this under a lock, to prevent two concurrent calls from
438 // needlessly doing the same array building at the same time.
439 synchronized(childrenArraysRebuild) {
// NOTE(review): elided — presumably declarations such as "int num = getSize();"
// and "int first;" here; confirm against the original source.
442 if (childrenArrays==null) {
// NOTE(review): elided — presumably "first = 0;" on this branch, with the
// else-branch below reading the previous array's length:
445 first = childrenArrays.getYoungestChildArray().length;
447 // If the taxonomy hasn't grown, we can return the existing object
// NOTE(review): elided — presumably "if (first == num)" guards this return.
450 return childrenArrays;
452 // Otherwise, build new arrays for a new ChildrenArray object.
453 // These arrays start with an enlarged copy of the previous arrays,
454 // and then are modified to take into account the new categories:
455 int[] newYoungestChildArray = new int[num];
456 int[] newOlderSiblingArray = new int[num];
457 // In Java 6, we could just do Arrays.copyOf()...
458 if (childrenArrays!=null) {
459 System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
460 newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
461 System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
462 newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
// New slots start out with no children until the linking loop below.
464 int[] parents = getParentArray();
465 for (int i=first; i<num; i++) {
466 newYoungestChildArray[i] = INVALID_ORDINAL;
468 // In the loop below we can ignore the root category (0) because
// NOTE(review): remainder of this comment and the loop-start adjustment
// appear elided in this extract.
472 newOlderSiblingArray[0] = INVALID_ORDINAL;
474 for (int i=first; i<num; i++) {
475 // Note that parents[i] is always < i, so the right-hand-side of
476 // the following line is already set when we get here.
// Prepend i to its parent's child list: i's older sibling becomes the
// parent's previous youngest child, then i becomes the youngest.
477 newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
478 newYoungestChildArray[parents[i]] = i;
480 // Finally switch to the new arrays
481 childrenArrays = new ChildrenArraysImpl(newYoungestChildArray,
482 newOlderSiblingArray);
483 return childrenArrays;
// Debug helper: dumps up to max categories, one "ordinal: path" line each.
// I/O errors while fetching a path are logged at FINEST and stop the dump.
487 public String toString(int max) {
488 StringBuilder sb = new StringBuilder();
489 int upperl = Math.min(max, this.indexReader.maxDoc());
490 for (int i = 0; i < upperl; i++) {
// NOTE(review): elided — the "try {" matching the catch clause below.
492 CategoryPath category = this.getPath(i);
493 if (category == null) {
494 sb.append(i + ": NULL!! \n");
// NOTE(review): elided — presumably "continue;".
497 if (category.length() == 0) {
498 sb.append(i + ": EMPTY STRING!! \n");
// NOTE(review): elided — presumably "continue;".
501 sb.append(i +": "+category.toString()+"\n");
502 } catch (IOException e) {
503 if (logger.isLoggable(Level.FINEST)) {
504 logger.log(Level.FINEST, e.getMessage(), e);
508 return sb.toString();
// Immutable-by-convention holder pairing the two prefetched child-link
// arrays handed out by getChildrenArrays(); callers must not modify them.
511 private static final class ChildrenArraysImpl implements ChildrenArrays {
512 private int[] youngestChildArray, olderSiblingArray;
513 public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
514 this.youngestChildArray = youngestChildArray;
515 this.olderSiblingArray = olderSiblingArray;
517 public int[] getOlderSiblingArray() {
518 return olderSiblingArray;
520 public int[] getYoungestChildArray() {
521 return youngestChildArray;
526 * Expert: This method is only for expert use.
527 * Note also that any call to refresh() will invalidate the returned reader,
528 * so the caller needs to take care of appropriate locking.
530 * @return lucene indexReader
532 IndexReader getInternalIndexReader() {
533 return this.indexReader;
537 * Expert: decreases the refCount of this TaxonomyReader instance.
538 * If the refCount drops to 0, then pending changes (if any) are
539 * committed to the taxonomy index and this reader is closed.
540 * @throws IOException if releasing the underlying reader fails.
542 public void decRef() throws IOException {
// Delegates reference counting entirely to the wrapped IndexReader.
543 this.indexReader.decRef();
547 * Expert: returns the current refCount for this taxonomy reader.
* @return the wrapped IndexReader's reference count.
549 public int getRefCount() {
550 return this.indexReader.getRefCount();
554 * Expert: increments the refCount of this TaxonomyReader instance.
555 * RefCounts are used to determine when a taxonomy reader can be closed
556 * safely, i.e. as soon as there are no more references.
557 * Be sure to always call a corresponding decRef(), in a finally clause;
558 * otherwise the reader may never be closed.
560 public void incRef() {
// Delegates reference counting entirely to the wrapped IndexReader.
561 this.indexReader.incRef();