package org.apache.lucene.facet.taxonomy.directory;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.collections.LRUHashMap;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * A {@link TaxonomyReader} which retrieves stored taxonomy information from a
 * {@link Directory}.
 * <P>
 * Reading from the on-disk index on every method call is too slow, so this
 * implementation employs caching: Some methods cache recent requests and their
 * results, while other methods prefetch all the data into memory and then
 * provide answers directly from in-memory tables. See the documentation of
 * individual methods for comments on their performance.
 * 
 * @lucene.experimental
 */
public class DirectoryTaxonomyReader implements TaxonomyReader {

  private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());

  private IndexReader indexReader;
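
  // Illustrative usage sketch (not part of this class). It shows how a caller
  // might open the reader over a taxonomy Directory, resolve a category to its
  // ordinal and back, and close the reader when done. The variable names and
  // the Directory "taxoDir" are assumptions made for the example:
  //
  //   DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
  //   try {
  //     int ordinal = taxoReader.getOrdinal(new CategoryPath("Author", "Mark Twain"));
  //     if (ordinal != TaxonomyReader.INVALID_ORDINAL) {
  //       CategoryPath path = taxoReader.getPath(ordinal);
  //     }
  //   } finally {
  //     taxoReader.close();
  //   }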
  // The following lock is used to allow multiple threads to read from the
  // index concurrently, while having them block during the very short
  // critical moment of refresh() (see comments below). Note, however, that
  // we only read from the index when we don't have the entry in our cache,
  // and the caches are locked separately.
  private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();
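
  // A sketch of the locking idiom the index-reading code paths below follow
  // (the exact surrounding statements are method-specific; this only
  // illustrates the read-side pattern):
  //
  //   try {
  //     indexReaderLock.readLock().lock();
  //     // ... read from indexReader ...
  //   } finally {
  //     indexReaderLock.readLock().unlock();
  //   }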
  // The following are the limited-size LRU caches used to cache the latest
  // results from getOrdinal() and getLabel().
  // Because LRUHashMap is not thread-safe, we need to synchronize on this
  // object when using it. Unfortunately, this is not optimal under heavy
  // contention because it means that while one thread is using the cache
  // (reading or modifying) others are blocked from using it - or even
  // starting to do benign things like calculating the hash function. A more
  // efficient approach would be to use a non-locking (as much as possible)
  // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
  // but with LRU semantics.
  // However, even in the current sub-optimal implementation we do not make
  // the mistake of locking out readers while waiting for disk in a cache
  // miss - below, we do not hold the cache lock while reading missing data from
  // the disk.
  private final LRUHashMap<String, Integer> ordinalCache;
  private final LRUHashMap<Integer, String> categoryCache;
  // getParent() needs to be extremely efficient, to the point that we need
  // to fetch all the data in advance into memory, and answer these calls
  // from memory. Currently we use a large integer array, which is
  // initialized when the taxonomy is opened, and potentially enlarged
  // when it is refresh()ed.
  // These arrays are not synchronized. Rather, the reference to the array
  // is volatile, and the only writing operation (refreshPrefetchArrays)
  // simply creates a new array and replaces the reference. The volatility
  // of the reference ensures the correct atomic replacement and its
  // visibility properties (the content of the array is visible when the
  // new reference is visible).
  private ParentArray parentArray;
  private char delimiter = Consts.DEFAULT_DELIMITER;

  private volatile boolean closed = false;
  /**
   * Open for reading a taxonomy stored in a given {@link Directory}.
   * @param directory
   *    The {@link Directory} in which the taxonomy lives. Note that
   *    the taxonomy is read directly from that directory (not from a
   *    subdirectory of it).
   * @throws CorruptIndexException if the Taxonomy is corrupted.
   * @throws IOException if another error occurred.
   */
  public DirectoryTaxonomyReader(Directory directory) throws IOException {
    this.indexReader = openIndexReader(directory);

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<String, Integer>(4000);
    categoryCache = new LRUHashMap<Integer, String>(4000);

    // TODO (Facet): consider lazily creating the parent array when asked, not in the constructor
    parentArray = new ParentArray();
    parentArray.refresh(indexReader);
  }
  protected IndexReader openIndexReader(Directory directory) throws CorruptIndexException, IOException {
    return IndexReader.open(directory);
  }
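
  // openIndexReader() is protected so that a subclass can control how the
  // underlying IndexReader is opened. A minimal sketch of such an override,
  // assuming the IndexReader.open(Directory, boolean) overload available in
  // this Lucene version (illustrative only):
  //
  //   @Override
  //   protected IndexReader openIndexReader(Directory directory) throws IOException {
  //     return IndexReader.open(directory, true); // open read-only
  //   }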
  /**
   * @throws AlreadyClosedException if this IndexReader is closed
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    if (indexReader.getRefCount() <= 0) {
      throw new AlreadyClosedException("this TaxonomyReader is closed");
    }
  }
  /**
   * setCacheSize controls the maximum allowed size of each of the caches
   * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
   * <P>
   * Currently, if the given size is smaller than the current size of
   * a cache, it will not shrink; rather, it will remain limited to its
   * current size.
   * @param size the new maximum cache size, in number of entries.
   */
  public void setCacheSize(int size) {
    synchronized (categoryCache) {
      categoryCache.setMaxSize(size);
    }
    synchronized (ordinalCache) {
      ordinalCache.setMaxSize(size);
    }
  }
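
  // For example, an application expecting a very large taxonomy might enlarge
  // both caches right after construction (50000 is an arbitrary, illustrative
  // value; "taxoReader" is the reader instance):
  //
  //   taxoReader.setCacheSize(50000);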
  /**
   * setDelimiter changes the character that the taxonomy uses in its
   * internal storage as a delimiter between category components. Do not
   * use this method unless you really know what you are doing.
   * <P>
   * If you do use this method, make sure you call it before any other
   * methods that actually query the taxonomy. Moreover, make sure you
   * always pass the same delimiter for all DirectoryTaxonomyWriter and
   * DirectoryTaxonomyReader objects you create.
   */
  public void setDelimiter(char delimiter) {
    this.delimiter = delimiter;
  }
  public int getOrdinal(CategoryPath categoryPath) throws IOException {
    if (categoryPath.length() == 0) {
      return ROOT_ORDINAL;
    }
    String path = categoryPath.toString(delimiter);

    // First try to find the answer in the LRU cache:
    synchronized (ordinalCache) {
      Integer res = ordinalCache.get(path);
      if (res != null) {
        return res.intValue();
      }
    }

    // If we're still here, we have a cache miss. We need to fetch the
    // value from disk, and then also put it in the cache:
    int ret = TaxonomyReader.INVALID_ORDINAL;
    try {
      indexReaderLock.readLock().lock();
      TermDocs docs = indexReader.termDocs(new Term(Consts.FULL, path));
      if (docs.next()) {
        ret = docs.doc();
      }
    } finally {
      indexReaderLock.readLock().unlock();
    }

    // Put the new value in the cache. Note that it is possible that while
    // we were doing the above fetching (without the cache locked), some
    // other thread already added the same category to the cache. We do
    // not care about this possibility, as LRUCache replaces previous values
    // of the same keys (it doesn't store duplicates).
    synchronized (ordinalCache) {
      // GB: new Integer(int) creates a new object each and every time.
      // Integer.valueOf(int) might not (see JavaDoc).
      ordinalCache.put(path, Integer.valueOf(ret));
    }

    return ret;
  }
  public CategoryPath getPath(int ordinal) throws CorruptIndexException, IOException {
    // TODO (Facet): Currently, the LRU cache we use (categoryCache) holds
    // strings with delimiters, not CategoryPath objects, so even if
    // we have a cache hit, we need to process the string and build a new
    // CategoryPath object every time. What is preventing us from putting
    // the actual CategoryPath object in the cache is the fact that these
    // objects are mutable. So we should create an immutable (read-only)
    // interface that CategoryPath implements, and this method should
    // return this interface, not the writable CategoryPath.
    String label = getLabel(ordinal);
    if (label == null) {
      return null;
    }
    return new CategoryPath(label, delimiter);
  }
  public boolean getPath(int ordinal, CategoryPath result) throws CorruptIndexException, IOException {
    String label = getLabel(ordinal);
    if (label == null) {
      return false;
    }
    result.clear();
    result.add(label, delimiter);
    return true;
  }
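
  // Reuse sketch (illustrative; "ordinals" is an assumed int[] of category
  // ordinals): callers resolving many ordinals can avoid allocating a new
  // CategoryPath per call by reusing a single instance with this method:
  //
  //   CategoryPath cp = new CategoryPath();
  //   for (int ord : ordinals) {
  //     if (taxoReader.getPath(ord, cp)) {
  //       // ... use cp before the next iteration overwrites it ...
  //     }
  //   }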
  private String getLabel(int catID) throws CorruptIndexException, IOException {
    // First try to find the answer in the LRU cache. It is very
    // unfortunate that we need to allocate an Integer object here -
    // it would have been better if we used a hash table specifically
    // designed for int keys...
    // GB: new Integer(int) creates a new object each and every time.
    // Integer.valueOf(int) might not (see JavaDoc).
    Integer catIDInteger = Integer.valueOf(catID);

    synchronized (categoryCache) {
      String res = categoryCache.get(catIDInteger);
      if (res != null) {
        return res;
      }
    }

    // If we're still here, we have a cache miss. We need to fetch the
    // value from disk, and then also put it in the cache:
    String ret = null;
    try {
      indexReaderLock.readLock().lock();
      // The taxonomy API dictates that if we get an invalid category
      // ID, we should return null. If we don't check this here, we
      // may get some sort of exception from the document() call below.
      // NOTE: Currently, we *do not* cache this return value; there
      // isn't much point in doing so, because checking the validity of
      // the docid doesn't require disk access - just comparing with
      // the number indexReader.maxDoc().
      if (catID < 0 || catID >= indexReader.maxDoc()) {
        return null;
      }
      ret = indexReader.document(catID, Consts.fullPathSelector)
          .get(Consts.FULL);
    } finally {
      indexReaderLock.readLock().unlock();
    }

    // Put the new value in the cache. Note that it is possible that while
    // we were doing the above fetching (without the cache locked), some
    // other thread already added the same category to the cache. We do
    // not care about this possibility, as LRUCache replaces previous
    // values of the same keys (it doesn't store duplicates).
    synchronized (categoryCache) {
      categoryCache.put(catIDInteger, ret);
    }

    return ret;
  }
  public int getParent(int ordinal) {
    // Note how we don't need to hold the read lock to do the following,
    // because the array reference is volatile, ensuring the correct
    // visibility and ordering: if we get the new reference, the new
    // data is also visible to this thread.
    return getParentArray()[ordinal];
  }
  /**
   * getParentArray() returns an int array of size getSize() listing the
   * ordinal of the parent category of each category in the taxonomy.
   * <P>
   * The caller can hold on to the array it got indefinitely - it is
   * guaranteed that no-one else will modify it. The other side of the
   * same coin is that the caller must treat the array it got as read-only
   * and <B>not modify it</B>, because other callers might have gotten the
   * same array too, and getParent() calls are also answered from the
   * same array.
   * <P>
   * The getParentArray() call is extremely efficient, merely returning
   * a reference to an array that already exists. For a caller that plans
   * to call getParent() for many categories, using getParentArray() and
   * the array it returns is a somewhat faster approach because it avoids
   * the overhead of method calls and volatile dereferencing.
   * <P>
   * If you use getParentArray() instead of getParent(), remember that
   * the array you got is (naturally) not modified after a refresh(),
   * so you should always call getParentArray() again after a refresh().
   */
  public int[] getParentArray() {
    // Note how we don't need to hold the read lock to do the following,
    // because the array reference is volatile, ensuring the correct
    // visibility and ordering: if we get the new reference, the new
    // data is also visible to this thread.
    return parentArray.getArray();
  }
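
  // Illustrative sketch of the intended use of getParentArray(): walking the
  // ancestor chain of a category entirely in memory ("ord" is an assumed
  // starting ordinal, valid and not the root, for the example):
  //
  //   int[] parents = taxoReader.getParentArray();
  //   for (int node = ord; node != TaxonomyReader.ROOT_ORDINAL; node = parents[node]) {
  //     // ... visit node ...
  //   }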
  // Note that refresh() is synchronized (it is the only synchronized
  // method in this class) to ensure that it never gets called concurrently
  // with itself.
  public synchronized boolean refresh() throws IOException, InconsistentTaxonomyException {
    /*
     * Since refresh() can be a lengthy operation, it is very important that we
     * avoid locking out all readers for its duration. This is why we don't hold
     * the indexReaderLock write lock for the entire duration of this method. In
     * fact, it is enough to hold it only during a single assignment! Other
     * comments in this method will explain this.
     */

    // note that the lengthy operation IndexReader.openIfChanged() does not
    // modify the reader, so we can do it without holding a lock. We can
    // safely read indexReader without holding the write lock, because
    // no other thread can be writing at this time (this method is the
    // only possible writer, and it is "synchronized" to avoid this case).
    IndexReader r2 = IndexReader.openIfChanged(indexReader);
    if (r2 == null) {
      return false; // no changes, nothing to do
    }

    // validate that a refresh is valid at this point, i.e. that the taxonomy
    // was not recreated since this reader was last opened or refreshed.
    String t1 = indexReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
    String t2 = r2.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
    if (t1 == null) {
      if (t2 != null) {
        r2.close();
        throw new InconsistentTaxonomyException("Taxonomy was recreated at: " + t2);
      }
    } else if (!t1.equals(t2)) {
      r2.close();
      throw new InconsistentTaxonomyException("Taxonomy was recreated at: " + t2 + " != " + t1);
    }
    IndexReader oldreader = indexReader;
    // we can close the old reader, but need to synchronize this
    // so that we don't close it while another routine is reading from it.
    indexReaderLock.writeLock().lock();
    indexReader = r2;
    indexReaderLock.writeLock().unlock();

    // We can close the old reader, but need to be certain that we
    // don't close it while another method is reading from it.
    // Luckily, we can be certain of that even without putting the
    // oldreader.close() in the locked section. The reason is that
    // after lock() succeeded above, we know that all existing readers
    // had finished (this is what a read-write lock ensures). New
    // readers, starting after the unlock() we just did, already got
    // the new indexReader we set above. So nobody can possibly be
    // using the old indexReader, and we can close it:
    oldreader.close();
    // We prefetch some of the arrays to make requests much faster.
    // Let's refresh these prefetched arrays; this refresh is made more
    // efficient by assuming that it is enough to read the values for new
    // categories (old categories could not have been changed or deleted).
    // Note that this is done without the write lock being held,
    // which means that it is possible that during a refresh(), a
    // reader will have some methods (like getOrdinal and getCategory)
    // return fresh information, while getParent()
    // (only to be prefetched now) still returns older information.
    // We consider this to be acceptable. The important thing,
    // however, is that refreshPrefetchArrays() itself writes to
    // the arrays in a correct manner (see discussion there).
    parentArray.refresh(indexReader);

    // Remove any INVALID_ORDINAL values from the ordinal cache,
    // because it is possible those are now answered by the new data!
    Iterator<Entry<String, Integer>> i = ordinalCache.entrySet().iterator();
    while (i.hasNext()) {
      Entry<String, Integer> e = i.next();
      if (e.getValue().intValue() == INVALID_ORDINAL) {
        i.remove();
      }
    }
    return true;
  }
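
  // Refresh usage sketch (illustrative; the names "taxoReader" and "taxoDir"
  // are assumptions): an application would typically call refresh()
  // periodically, and fall back to opening a fresh reader if the underlying
  // taxonomy was recreated:
  //
  //   try {
  //     taxoReader.refresh();
  //   } catch (InconsistentTaxonomyException e) {
  //     taxoReader.close();
  //     taxoReader = new DirectoryTaxonomyReader(taxoDir);
  //   }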
  public void close() throws IOException {
    if (!closed) {
      decRef();
      closed = true;
    }
  }

  /** Do the actual closing, free up resources */
  private void doClose() throws IOException {
    indexReader.close();
    parentArray = null;
    childrenArrays = null;
    categoryCache.clear();
    ordinalCache.clear();
  }
  public int getSize() {
    indexReaderLock.readLock().lock();
    try {
      return indexReader.numDocs();
    } finally {
      indexReaderLock.readLock().unlock();
    }
  }
  public Map<String, String> getCommitUserData() {
    return indexReader.getCommitUserData();
  }
  private ChildrenArrays childrenArrays;
  Object childrenArraysRebuild = new Object();

  public ChildrenArrays getChildrenArrays() {
    // Check if the taxonomy grew since we built the arrays, and if it
    // did, create new (and larger) arrays and fill them as required.
    // We do all this under a lock, to prevent two concurrent calls from
    // needlessly doing the same array building at the same time.
    synchronized (childrenArraysRebuild) {
      int num = getSize();
      int first;
      if (childrenArrays == null) {
        first = 0;
      } else {
        first = childrenArrays.getYoungestChildArray().length;
      }
      // If the taxonomy hasn't grown, we can return the existing object
      // immediately
      if (first == num) {
        return childrenArrays;
      }
      // Otherwise, build new arrays for a new ChildrenArray object.
      // These arrays start with an enlarged copy of the previous arrays,
      // and then are modified to take into account the new categories:
      int[] newYoungestChildArray = new int[num];
      int[] newOlderSiblingArray = new int[num];
      // In Java 6, we could just do Arrays.copyOf()...
      if (childrenArrays != null) {
        System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
            newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
        System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
            newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
      }
      int[] parents = getParentArray();
      for (int i = first; i < num; i++) {
        newYoungestChildArray[i] = INVALID_ORDINAL;
      }
      // In the loop below we can ignore the root category (0) because
      // it has no parent
      if (first == 0) {
        first = 1;
        newOlderSiblingArray[0] = INVALID_ORDINAL;
      }
      for (int i = first; i < num; i++) {
        // Note that parents[i] is always < i, so the right-hand side of
        // the following line is already set when we get here.
        newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
        newYoungestChildArray[parents[i]] = i;
      }
      // Finally switch to the new arrays
      childrenArrays = new ChildrenArraysImpl(newYoungestChildArray,
          newOlderSiblingArray);
      return childrenArrays;
    }
  }
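
  // Illustrative sketch of how a caller typically consumes the two arrays:
  // enumerating the children of a category ("parentOrdinal" is an assumed
  // starting ordinal for the example):
  //
  //   ChildrenArrays ca = taxoReader.getChildrenArrays();
  //   int[] youngest = ca.getYoungestChildArray();
  //   int[] olderSibling = ca.getOlderSiblingArray();
  //   for (int child = youngest[parentOrdinal];
  //        child != TaxonomyReader.INVALID_ORDINAL;
  //        child = olderSibling[child]) {
  //     // ... visit child ...
  //   }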
  public String toString(int max) {
    StringBuilder sb = new StringBuilder();
    int upperl = Math.min(max, this.indexReader.maxDoc());
    for (int i = 0; i < upperl; i++) {
      try {
        CategoryPath category = this.getPath(i);
        if (category == null) {
          sb.append(i + ": NULL!! \n");
          continue;
        }
        if (category.length() == 0) {
          sb.append(i + ": EMPTY STRING!! \n");
          continue;
        }
        sb.append(i + ": " + category.toString() + "\n");
      } catch (IOException e) {
        if (logger.isLoggable(Level.FINEST)) {
          logger.log(Level.FINEST, e.getMessage(), e);
        }
      }
    }
    return sb.toString();
  }
  private static final class ChildrenArraysImpl implements ChildrenArrays {
    private int[] youngestChildArray, olderSiblingArray;
    public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
      this.youngestChildArray = youngestChildArray;
      this.olderSiblingArray = olderSiblingArray;
    }
    public int[] getOlderSiblingArray() {
      return olderSiblingArray;
    }
    public int[] getYoungestChildArray() {
      return youngestChildArray;
    }
  }
  /**
   * Expert: This method is only for expert use.
   * Note also that any call to refresh() will invalidate the returned reader,
   * so the caller needs to take care of appropriate locking.
   * 
   * @return lucene indexReader
   */
  IndexReader getInternalIndexReader() {
    return this.indexReader;
  }
  /**
   * Expert: decreases the refCount of this TaxonomyReader instance.
   * If the refCount drops to 0, then pending changes (if any) are
   * committed to the taxonomy index and this reader is closed.
   * @throws IOException
   */
  public void decRef() throws IOException {
    if (indexReader.getRefCount() == 1) {
      // Do not decRef the indexReader - doClose does it by calling reader.close()
      doClose();
    } else {
      indexReader.decRef();
    }
  }
  /**
   * Expert: returns the current refCount for this taxonomy reader
   */
  public int getRefCount() {
    return this.indexReader.getRefCount();
  }
  /**
   * Expert: increments the refCount of this TaxonomyReader instance.
   * RefCounts are used to determine when a taxonomy reader can be closed
   * safely, i.e. as soon as there are no more references.
   * Be sure to always call a corresponding decRef(), in a finally clause;
   * otherwise the reader may never be closed.
   */
  public void incRef() {
    this.indexReader.incRef();
  }
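
  // Illustrative incRef()/decRef() usage (a sketch; "taxoReader" is assumed to
  // be a shared DirectoryTaxonomyReader instance): pair every incRef() with a
  // decRef() in a finally clause so the reader is eventually released.
  //
  //   taxoReader.incRef();
  //   try {
  //     // ... use taxoReader for this request ...
  //   } finally {
  //     taxoReader.decRef();
  //   }

}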