lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java

   1 package org.apache.lucene.facet.taxonomy;
   2
   3 import java.io.Closeable;
   4 import java.io.IOException;
   5 import java.util.Map;
   6
   7 /**
   8  * Licensed to the Apache Software Foundation (ASF) under one or more
   9  * contributor license agreements.  See the NOTICE file distributed with
  10  * this work for additional information regarding copyright ownership.
  11  * The ASF licenses this file to You under the Apache License, Version 2.0
  12  * (the "License"); you may not use this file except in compliance with
  13  * the License.  You may obtain a copy of the License at
  14  *
  15  *     http://www.apache.org/licenses/LICENSE-2.0
  16  *
  17  * Unless required by applicable law or agreed to in writing, software
  18  * distributed under the License is distributed on an "AS IS" BASIS,
  19  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  20  * See the License for the specific language governing permissions and
  21  * limitations under the License.
  22  */
  23
  24 /**
  25  * TaxonomyReader is the read-only interface with which the faceted-search
  26  * library uses the taxonomy during search time.
  27  * <P>
  28  * A TaxonomyReader holds a list of categories. Each category has a serial
  29  * number which we call an "ordinal", and a hierarchical "path" name:
  30  * <UL>
  31  * <LI>
  32  * The ordinal is an integer that starts at 0 for the first category (which is
  33  * always the root category), and grows contiguously as more categories are
  34  * added; Note that once a category is added, it can never be deleted.
  35  * <LI>
  36  * The path is a CategoryPath object specifying the category's position in the
  37  * hierarchy.
  38  * </UL>
  39  * <B>Notes about concurrent access to the taxonomy:</B>
  40  * <P>
  41  * An implementation must allow multiple readers to be active concurrently
  42  * with a single writer. Readers follow so-called "point in time" semantics,
  43  * i.e., a TaxonomyReader object will only see taxonomy entries which were
  44  * available at the time it was created. What the writer writes is only
  45  * available to (new) readers after the writer's commit() is called.
  46  * <P>
  47  * In faceted search, two separate indices are used: the main Lucene index,
  48  * and the taxonomy. Because the main index refers to the categories listed
  49  * in the taxonomy, it is important to open the taxonomy *after* opening the
  50  * main index, and it is also necessary to reopen() the taxonomy after
  51  * reopen()ing the main index.
  52  * <P>
  53  * This order is important, otherwise it would be possible for the main index
  54  * to refer to a category which is not yet visible in the old snapshot of
  55  * the taxonomy. Note that it is indeed fine for the the taxonomy to be opened
  56  * after the main index - even a long time after. The reason is that once
  57  * a category is added to the taxonomy, it can never be changed or deleted,
  58  * so there is no danger that a "too new" taxonomy not being consistent with
  59  * an older index.
  60  *
  61  * @lucene.experimental
  62  */
  63 public interface TaxonomyReader extends Closeable {
  64
  65   /**
  66    * The root category (the category with the empty path) always has the
  67    * ordinal 0, to which we give a name ROOT_ORDINAL.
  68    * getOrdinal() of an empty path will always return ROOT_ORDINAL, and
  69    * getCategory(ROOT_ORDINAL) will return the empty path.
  70    */
  71   public final static int ROOT_ORDINAL = 0;
  72
  73   /**
  74    * Ordinals are always non-negative, so a negative ordinal can be used to
  75    * signify an error. Methods here return INVALID_ORDINAL (-1) in this case.
  76    */
  77   public final static int INVALID_ORDINAL = -1;
  78
  79   /**
  80    * getOrdinal() returns the ordinal of the category given as a path.
  81    * The ordinal is the category's serial number, an integer which starts
  82    * with 0 and grows as more categories are added (note that once a category
  83    * is added, it can never be deleted).
  84    * <P>
  85    * If the given category wasn't found in the taxonomy, INVALID_ORDINAL is
  86    * returned.
  87    */
  88   public int getOrdinal(CategoryPath categoryPath) throws IOException;
  89
  90   /**
  91    * getPath() returns the path name of the category with the given
  92    * ordinal. The path is returned as a new CategoryPath object - to
  93    * reuse an existing object, use {@link #getPath(int, CategoryPath)}.
  94    * <P>
  95    * A null is returned if a category with the given ordinal does not exist.
  96    */
  97   public CategoryPath getPath(int ordinal) throws IOException;
  98
  99   /**
 100    * getPath() returns the path name of the category with the given
 101    * ordinal. The path is written to the given CategoryPath object (which
 102    * is cleared first).
 103    * <P>
 104    * If a category with the given ordinal does not exist, the given
 105    * CategoryPath object is not modified, and the method returns
 106    * <code>false</code>. Otherwise, the method returns <code>true</code>.
 107    */
 108   public boolean getPath(int ordinal, CategoryPath result) throws IOException;
 109
 110   /**
 111    * refresh() re-reads the taxonomy information if there were any changes to
 112    * the taxonomy since this instance was opened or last refreshed. Calling
 113    * refresh() is more efficient than close()ing the old instance and opening a
 114    * new one.
 115    * <P>
 116    * If there were no changes since this instance was opened or last refreshed,
 117    * then this call does nothing. Note, however, that this is still a relatively
 118    * slow method (as it needs to verify whether there have been any changes on
 119    * disk to the taxonomy), so it should not be called too often needlessly. In
 120    * faceted search, the taxonomy reader's refresh() should be called only after
 121    * a reopen() of the main index.
 122    * <P>
 123    * Refreshing the taxonomy might fail in some cases, for example
 124    * if the taxonomy was recreated since this instance was opened or last refreshed.
 125    * In this case an {@link InconsistentTaxonomyException} is thrown,
 126    * suggesting that in order to obtain up-to-date taxonomy data a new
 127    * {@link TaxonomyReader} should be opened. Note: This {@link TaxonomyReader}
 128    * instance remains unchanged and usable in this case, and the application can
 129    * continue to use it, and should still {@link #close()} when no longer needed.
 130    * <P>
 131    * It should be noted that refresh() is similar in purpose to
 132    * IndexReader.reopen(), but the two methods behave differently. refresh()
 133    * refreshes the existing TaxonomyReader object, rather than opening a new one
 134    * in addition to the old one as reopen() does. The reason is that in a
 135    * taxonomy, one can only add new categories and cannot modify or delete
 136    * existing categories; Therefore, there is no reason to keep an old snapshot
 137    * of the taxonomy open - refreshing the taxonomy to the newest data and using
 138    * this new snapshots in all threads (whether new or old) is fine. This saves
 139    * us needing to keep multiple copies of the taxonomy open in memory.
 140    * @return true if anything has changed, false otherwise.
 141    */
 142   public boolean refresh() throws IOException, InconsistentTaxonomyException;
 143
 144   /**
 145    * getParent() returns the ordinal of the parent category of the category
 146    * with the given ordinal.
 147    * <P>
 148    * When a category is specified as a path name, finding the path of its
 149    * parent is as trivial as dropping the last component of the path.
 150    * getParent() is functionally equivalent to calling getPath() on the
 151    * given ordinal, dropping the last component of the path, and then calling
 152    * getOrdinal() to get an ordinal back. However, implementations are
 153    * expected to provide a much more efficient implementation:
 154    * <P>
 155    * getParent() should be a very quick method, as it is used during the
 156    * facet aggregation process in faceted search. Implementations will most
 157    * likely want to serve replies to this method from a pre-filled cache.
 158    * <P>
 159    * If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned.
 160    * If the given ordinal is a top-level category, the ROOT_ORDINAL is returned.
 161    * If an invalid ordinal is given (negative or beyond the last available
 162    * ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is
 163    * expected that getParent will only be called for ordinals which are
 164    * already known to be in the taxonomy.
 165    */
 166   public int getParent(int ordinal) throws IOException;
 167
 168   /**
 169    * getParentArray() returns an int array of size getSize() listing the
 170    * ordinal of the parent category of each category in the taxonomy.
 171    * <P>
 172    * The caller can hold on to the array it got indefinitely - it is
 173    * guaranteed that no-one else will modify it. The other side of the
 174    * same coin is that the caller must treat the array it got as read-only
 175    * and <B>not modify it</B>, because other callers might have gotten the
 176    * same array too (and getParent() calls might be answered from the
 177    * same array).
 178    * <P>
 179    * If you use getParentArray() instead of getParent(), remember that
 180    * the array you got is (naturally) not modified after a refresh(),
 181    * so you should always call getParentArray() again after a refresh().
 182    * <P>
 183    * This method's function is similar to allocating an array of size
 184    * getSize() and filling it with getParent() calls, but implementations
 185    * are encouraged to implement it much more efficiently, with O(1)
 186    * complexity. This can be done, for example, by the implementation
 187    * already keeping the parents in an array, and just returning this
 188    * array (without any allocation or copying) when requested.
 189    */
 190   public int[] getParentArray() throws IOException;
 191
 192   /**
 193    * Equivalent representations of the taxonomy's parent info,
 194    * used internally for efficient computation of facet results:
 195    * "youngest child" and "oldest sibling"
 196    */
 197   public static interface ChildrenArrays {
 198     /**
 199      * getYoungestChildArray() returns an int array of size getSize()
 200      * listing the ordinal of the youngest (highest numbered) child
 201      * category of each category in the taxonomy. The value for a leaf
 202      * category (a category without children) is
 203      * <code>INVALID_ORDINAL</code>.
 204      */
 205     public int[] getYoungestChildArray();
 206     /**
 207      * getOlderSiblingArray() returns an int array of size getSize()
 208      * listing for each category the ordinal of its immediate older
 209      * sibling (the sibling in the taxonomy tree with the highest ordinal
 210      * below that of the given ordinal). The value for a category with no
 211      * older sibling is <code>INVALID_ORDINAL</code>.
 212      */
 213     public int[] getOlderSiblingArray();
 214   }
 215
 216   /**
 217    * getChildrenArrays() returns a {@link ChildrenArrays} object which can
 218    * be used together to efficiently enumerate the children of any category.
 219    * <P>
 220    * The caller can hold on to the object it got indefinitely - it is
 221    * guaranteed that no-one else will modify it. The other side of the
 222    * same coin is that the caller must treat the object which it got (and
 223    * the arrays it contains) as read-only and <B>not modify it</B>, because
 224    * other callers might have gotten the same object too.
 225    * <P>
 226    * Implementations should have O(getSize()) time for the first call or
 227    * after a refresh(), but O(1) time for further calls. In neither case
 228    * there should be a need to read new data from disk. These guarantees
 229    * are most likely achieved by calculating this object (based on the
 230    * getParentArray()) when first needed, and later (if the taxonomy was not
 231    * refreshed) returning the same object (without any allocation or copying)
 232    * when requested.
 233    * <P>
 234    * The reason we have one method returning one object, rather than two
 235    * methods returning two arrays, is to avoid race conditions in a multi-
 236    * threaded application: We want to avoid the possibility of returning one
 237    * new array and one old array, as those could not be used together.
 238    */
 239   public ChildrenArrays getChildrenArrays();
 240
 241   /**
 242    * Retrieve user committed data.
 243    * @see TaxonomyWriter#commit(Map)
 244    */
 245   public Map<String, String> getCommitUserData();
 246
 247   /**
 248    * Expert: increments the refCount of this TaxonomyReader instance.
 249    * RefCounts can be used to determine when a taxonomy reader can be closed
 250    * safely, i.e. as soon as there are no more references.
 251    * Be sure to always call a corresponding decRef(), in a finally clause;
 252    * otherwise the reader may never be closed.
 253    */
 254   public void incRef();
 255
 256   /**
 257    * Expert: decreases the refCount of this TaxonomyReader instance.
 258    * If the refCount drops to 0, then pending changes (if any) can be
 259    * committed to the taxonomy index and this reader can be closed.
 260    * @throws IOException
 261    */
 262   public void decRef() throws IOException;
 263
 264   /**
 265    * Expert: returns the current refCount for this taxonomy reader
 266    */
 267   public int getRefCount();
 268
 269   /**
 270    * getSize() returns the number of categories in the taxonomy.
 271    * <P>
 272    * Because categories are numbered consecutively starting with 0, it
 273    * means the taxonomy contains ordinals 0 through getSize()-1.
 274    * <P>
 275    * Note that the number returned by getSize() is often slightly higher
 276    * than the number of categories inserted into the taxonomy; This is
 277    * because when a category is added to the taxonomy, its ancestors
 278    * are also added automatically (including the root, which always get
 279    * ordinal 0).
 280    */
 281   public int getSize();
 282
 283 }