1 package org.apache.lucene.facet.taxonomy.lucene;
4 import java.io.IOException;
5 import java.util.Iterator;
7 import java.util.Map.Entry;
8 import java.util.concurrent.locks.ReadWriteLock;
9 import java.util.concurrent.locks.ReentrantReadWriteLock;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
13 import org.apache.lucene.index.CorruptIndexException;
14 import org.apache.lucene.index.IndexReader;
15 import org.apache.lucene.index.Term;
16 import org.apache.lucene.index.TermDocs;
17 import org.apache.lucene.store.Directory;
18 import org.apache.lucene.store.FSDirectory;
20 import org.apache.lucene.facet.taxonomy.CategoryPath;
21 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
22 import org.apache.lucene.util.collections.LRUHashMap;
25 * Licensed to the Apache Software Foundation (ASF) under one or more
26 * contributor license agreements. See the NOTICE file distributed with
27 * this work for additional information regarding copyright ownership.
28 * The ASF licenses this file to You under the Apache License, Version 2.0
29 * (the "License"); you may not use this file except in compliance with
30 * the License. You may obtain a copy of the License at
32 * http://www.apache.org/licenses/LICENSE-2.0
34 * Unless required by applicable law or agreed to in writing, software
35 * distributed under the License is distributed on an "AS IS" BASIS,
36 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 * See the License for the specific language governing permissions and
38 * limitations under the License.
42 * LuceneTaxonomyReader is a {@link TaxonomyReader} which retrieves stored
43 * taxonomy information from a separate Lucene index. By using a Lucene index,
44 * rather than some specialized file format, we get for "free" its correctness
45 * (especially regarding concurrency), and the ability to save it on any
46 * implementation of Directory (and not just the file system).
48 * Reading from the on-disk index on every method call is too slow, so this
49 * implementation employs caching: Some methods cache recent requests and
50 * their results, while other methods prefetch all the data into memory
51 * and then provide answers directly from in-memory tables. See the
52 * documentation of individual methods for comments on their performance.
54 * @lucene.experimental
56 public class LuceneTaxonomyReader implements TaxonomyReader {
// Logger for this class; only consulted at FINEST level (see toString(int)).
58 private static final Logger logger = Logger.getLogger(LuceneTaxonomyReader.class.getName());
// The underlying Lucene index holding the taxonomy. Swapped for a reopened
// reader by refresh(); all swaps happen under indexReaderLock's write lock.
60 private IndexReader indexReader;
62 // The following lock is used to allow multiple threads to read from the
63 // index concurrently, while having them block during the very short
64 // critical moment of refresh() (see comments below). Note, however, that
65 // we only read from the index when we don't have the entry in our cache,
66 // and the caches are locked separately.
67 private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();
69 // The following are the limited-size LRU caches used to cache the latest
70 // results from getOrdinal() and getCategoryCache().
71 // Because LRUHashMap is not thread-safe, we need to synchronize on this
72 // object when using it. Unfortunately, this is not optimal under heavy
73 // contention because it means that while one thread is using the cache
74 // (reading or modifying) others are blocked from using it - or even
75 // starting to do benign things like calculating the hash function. A more
76 // efficient approach would be to use a non-locking (as much as possible)
77 // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
78 // but with LRU semantics.
79 // However, even in the current sub-optimal implementation we do not make
80 // the mistake of locking out readers while waiting for disk in a cache
81 // miss - below, we do not hold cache lock while reading missing data from disk.
83 private final LRUHashMap<String, Integer> getOrdinalCache;
84 private final LRUHashMap<Integer, String> getCategoryCache;
86 // getParent() needs to be extremely efficient, to the point that we need
87 // to fetch all the data in advance into memory, and answer these calls
88 // from memory. Currently we use a large integer array, which is
89 // initialized when the taxonomy is opened, and potentially enlarged
90 // when it is refresh()ed.
91 // These arrays are not synchronized. Rather, the reference to the array
92 // is volatile, and the only writing operation (refreshPrefetchArrays)
93 // simply creates a new array and replaces the reference. The volatility
94 // of the reference ensures the correct atomic replacement and its
95 // visibility properties (the content of the array is visible when the
96 // new reference is visible).
97 private ParentArray parentArray;
// Character used between path components in the index's stored category
// strings; must match the one used by the writer (see setDelimiter()).
99 private char delimiter = Consts.DEFAULT_DELIMITER;
102 * Open for reading a taxonomy stored in a given {@link Directory}.
* @param directory
104 * The {@link Directory} in which the taxonomy lives. Note that
105 * the taxonomy is read directly to that directory (not from a
106 * subdirectory of it).
107 * @throws CorruptIndexException if the Taxonomy is corrupted.
108 * @throws IOException if another error occurred.
110 public LuceneTaxonomyReader(Directory directory)
111 throws CorruptIndexException, IOException {
112 this.indexReader = openIndexReader(directory);
114 // These are the default cache sizes; they can be configured after
115 // construction with the cache's setMaxSize() method
116 getOrdinalCache = new LRUHashMap<String, Integer>(4000);
117 getCategoryCache = new LRUHashMap<Integer, String>(4000);
119 // TODO (Facet): consider lazily creating the parent array when asked, not in the constructor
// Eagerly load the ordinal->parent mapping so getParent() is answered
// purely from memory afterwards.
120 parentArray = new ParentArray();
121 parentArray.refresh(indexReader);
// Factory hook: subclasses may override to control how the underlying
// taxonomy index is opened (e.g. with a custom deletion policy).
124 protected IndexReader openIndexReader(Directory directory) throws CorruptIndexException, IOException {
125 return IndexReader.open(directory);
128 // convenience constructors... deprecated because they cause confusion
129 // since they use the parent directory instead of the actual directory.
130 private Directory ourDirectory = null; // remember directory to close later, but only if we opened it here
132 * Open for reading a taxonomy stored in a subdirectory of a given
133 * directory on the file system.
134 * @param parentDir The parent directory of the taxonomy's directory
135 * (usually this would be the directory holding the index).
136 * @param name The name of the taxonomy, and the subdirectory holding it.
137 * @throws CorruptIndexException if the Taxonomy is corrupted.
138 * @throws IOException if another error occurred.
* @deprecated use {@link #LuceneTaxonomyReader(Directory)} instead.
// NOTE(review): the @Deprecated annotation appears elided in this extract
// (the comment above the ourDirectory field calls these constructors
// deprecated) — confirm against the original source.
141 public LuceneTaxonomyReader(File parentDir, String name)
142 throws CorruptIndexException, IOException {
143 this(FSDirectory.open(new File(parentDir, name)));
144 ourDirectory = indexReader.directory(); // remember to close the directory we opened
148 * Open for reading a taxonomy stored in a subdirectory of a given
149 * directory on the file system.
150 * @param parentDir The parent directory of the taxonomy's directory.
151 * @param name The name of the taxonomy, and the subdirectory holding it.
152 * @throws CorruptIndexException if the Taxonomy is corrupted.
153 * @throws IOException if another error occurred.
* @deprecated use {@link #LuceneTaxonomyReader(Directory)} instead.
// NOTE(review): the @Deprecated annotation appears elided in this extract
// (the comment above the ourDirectory field calls these constructors
// deprecated) — confirm against the original source.
156 public LuceneTaxonomyReader(String parentDir, String name)
157 throws CorruptIndexException, IOException {
158 this(FSDirectory.open(new File(parentDir, name)));
159 ourDirectory = indexReader.directory(); // remember to close the directory we opened
163 * setCacheSize controls the maximum allowed size of each of the caches
164 * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
166 * Currently, if the given size is smaller than the current size of
167 * a cache, it will not shrink, and rather will be limited to its current
* size.
169 * @param size the new maximum cache size, in number of entries.
171 public void setCacheSize(int size) {
// Each cache has its own monitor; take them one at a time, never nested.
172 synchronized(getCategoryCache) {
173 getCategoryCache.setMaxSize(size);
175 synchronized(getOrdinalCache) {
176 getOrdinalCache.setMaxSize(size);
181 * setDelimiter changes the character that the taxonomy uses in its
182 * internal storage as a delimiter between category components. Do not
183 * use this method unless you really know what you are doing.
185 * If you do use this method, make sure you call it before any other
186 * methods that actually query the taxonomy. Moreover, make sure you
187 * always pass the same delimiter for all LuceneTaxonomyWriter and
188 * LuceneTaxonomyReader objects you create.
190 public void setDelimiter(char delimiter) {
191 this.delimiter = delimiter;
// Returns the ordinal of the given category, or INVALID_ORDINAL if it is
// not present in the taxonomy. Serves from the LRU cache when possible;
// on a miss, reads the index under the read lock and caches the result.
194 public int getOrdinal(CategoryPath categoryPath) throws IOException {
195 if (categoryPath.length()==0) {
// NOTE(review): branch body elided in this extract — presumably returns
// ROOT_ORDINAL for the empty (root) path; confirm against the original.
198 String path = categoryPath.toString(delimiter);
200 // First try to find the answer in the LRU cache:
201 synchronized(getOrdinalCache) {
202 Integer res = getOrdinalCache.get(path);
// NOTE(review): the "res != null" guard for this dereference is elided here.
204 return res.intValue();
208 // If we're still here, we have a cache miss. We need to fetch the
209 // value from disk, and then also put it in the cache:
210 int ret = TaxonomyReader.INVALID_ORDINAL;
// NOTE(review): a try { ... } finally { unlock } wrapper appears elided
// here; the unlock below must run even if termDocs() throws.
212 indexReaderLock.readLock().lock();
213 TermDocs docs = indexReader.termDocs(new Term(Consts.FULL, path));
// NOTE(review): elided — presumably "if (docs.next()) ret = docs.doc();".
218 indexReaderLock.readLock().unlock();
221 // Put the new value in the cache. Note that it is possible that while
222 // we were doing the above fetching (without the cache locked), some
223 // other thread already added the same category to the cache. We do
224 // not care about this possibility, as LRUCache replaces previous values
225 // of the same keys (it doesn't store duplicates).
226 synchronized(getOrdinalCache) {
227 // GB: new Integer(int); creates a new object each and every time.
228 // Integer.valueOf(int) might not (See JavaDoc).
229 getOrdinalCache.put(path, Integer.valueOf(ret));
// Builds and returns a fresh CategoryPath for the given ordinal by parsing
// the cached/stored label string with the current delimiter.
235 public CategoryPath getPath(int ordinal) throws CorruptIndexException, IOException {
236 // TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
237 // strings with delimiters, not CategoryPath objects, so even if
238 // we have a cache hit, we need to process the string and build a new
239 // CategoryPath object every time. What is preventing us from putting
240 // the actual CategoryPath object in the cache is the fact that these
241 // objects are mutable. So we should create an immutable (read-only)
242 // interface that CategoryPath implements, and this method should
243 // return this interface, not the writable CategoryPath.
244 String label = getLabel(ordinal);
// NOTE(review): elided — presumably "if (label==null) return null;" for
// ordinals outside the taxonomy; confirm against the original source.
248 return new CategoryPath(label, delimiter);
// Appends the path of the given ordinal into the caller-supplied (reusable)
// CategoryPath; returns whether the ordinal was found.
251 public boolean getPath(int ordinal, CategoryPath result) throws CorruptIndexException, IOException {
252 String label = getLabel(ordinal);
// NOTE(review): elided — presumably "if (label==null) return false;" before
// the add, and "return true;" after it; confirm against the original.
257 result.add(label, delimiter);
// Returns the stored label (delimited full-path string) of the given
// category ordinal, or null when the ordinal is out of range. LRU cache
// first; on a miss, reads the stored document under the read lock.
261 private String getLabel(int catID) throws CorruptIndexException, IOException {
262 // First try to find the answer in the LRU cache. It is very
263 // unfortunate that we need to allocate an Integer object here -
264 // it would have been better if we used a hash table specifically
265 // designed for int keys...
266 // GB: new Integer(int); creates a new object each and every time.
267 // Integer.valueOf(int) might not (See JavaDoc).
268 Integer catIDInteger = Integer.valueOf(catID);
270 synchronized(getCategoryCache) {
271 String res = getCategoryCache.get(catIDInteger);
// NOTE(review): the "res != null" guard and cache-hit return are elided here.
277 // If we're still here, we have a cache miss. We need to fetch the
278 // value from disk, and then also put it in the cache:
// NOTE(review): the declaration of "ret" and a try { ... } finally { unlock }
// wrapper appear elided in this extract; confirm against the original.
281 indexReaderLock.readLock().lock();
282 // The taxonomy API dictates that if we get an invalid category
283 // ID, we should return null. If we don't check this here, we
284 // can get some sort of an exception from the document() call below.
285 // NOTE: Currently, we *do not* cache this return value; There
286 // isn't much point to do so, because checking the validity of
287 // the docid doesn't require disk access - just comparing with
288 // the number indexReader.maxDoc().
289 if (catID<0 || catID>=indexReader.maxDoc()) {
// NOTE(review): elided — presumably returns null (after unlocking).
292 ret = indexReader.document(catID, Consts.fullPathSelector)
// NOTE(review): elided — presumably the ".get(Consts.FULL);" continuation
// of this call expression.
295 indexReaderLock.readLock().unlock();
297 // Put the new value in the cache. Note that it is possible that while
298 // we were doing the above fetching (without the cache locked), some
299 // other thread already added the same category to the cache. We do
300 // not care about this possibility, as LRUCache replaces previous
301 // values of the same keys (it doesn't store duplicates).
302 synchronized (getCategoryCache) {
303 getCategoryCache.put(catIDInteger, ret);
// O(1) in-memory lookup of a category's parent ordinal; never touches disk
// because the parent array is prefetched (see parentArray field comments).
309 public int getParent(int ordinal) {
310 // Note how we don't need to hold the read lock to do the following,
311 // because the array reference is volatile, ensuring the correct
312 // visibility and ordering: if we get the new reference, the new
313 // data is also visible to this thread.
314 return getParentArray()[ordinal];
318 * getParentArray() returns an int array of size getSize() listing the
319 * ordinal of the parent category of each category in the taxonomy.
321 * The caller can hold on to the array it got indefinitely - it is
322 * guaranteed that no-one else will modify it. The other side of the
323 * same coin is that the caller must treat the array it got as read-only
324 * and <B>not modify it</B>, because other callers might have gotten the
325 * same array too, and getParent() calls are also answered from the
* same array.
328 * The getParentArray() call is extremely efficient, merely returning
329 * a reference to an array that already exists. For a caller that plans
330 * to call getParent() for many categories, using getParentArray() and
331 * the array it returns is a somewhat faster approach because it avoids
332 * the overhead of method calls and volatile dereferencing.
334 * If you use getParentArray() instead of getParent(), remember that
335 * the array you got is (naturally) not modified after a refresh(),
336 * so you should always call getParentArray() again after a refresh().
339 public int[] getParentArray() {
340 // Note how we don't need to hold the read lock to do the following,
341 // because the array reference is volatile, ensuring the correct
342 // visibility and ordering: if we get the new reference, the new
343 // data is also visible to this thread.
344 return parentArray.getArray();
347 // Note that refresh() is synchronized (it is the only synchronized
348 // method in this class) to ensure that it never gets called concurrently
// with itself (i.e. there is at most one writer at any moment).
350 public synchronized void refresh() throws IOException {
352 * Since refresh() can be a lengthy operation, it is very important that we
353 * avoid locking out all readers for its duration. This is why we don't hold
354 * the indexReaderLock write lock for the entire duration of this method. In
355 * fact, it is enough to hold it only during a single assignment! Other
356 * comments in this method will explain this.
359 // note that the lengthy operation indexReader.reopen() does not
360 // modify the reader, so we can do it without holding a lock. We can
361 // safely read indexReader without holding the write lock, because
362 // no other thread can be writing at this time (this method is the
363 // only possible writer, and it is "synchronized" to avoid this case).
364 IndexReader r2 = indexReader.reopen();
365 if (indexReader != r2) {
366 IndexReader oldreader = indexReader;
367 // we can close the old searcher, but need to synchronize this
368 // so that we don't close it in the middle that another routine
369 // is reading from it.
370 indexReaderLock.writeLock().lock();
// NOTE(review): the critical assignment "indexReader = r2;" — the single
// write this lock exists to protect — appears elided in this extract.
372 indexReaderLock.writeLock().unlock();
373 // We can close the old reader, but need to be certain that we
374 // don't close it while another method is reading from it.
375 // Luckily, we can be certain of that even without putting the
376 // oldreader.close() in the locked section. The reason is that
377 // after lock() succeeded above, we know that all existing readers
378 // had finished (this is what a read-write lock ensures). New
379 // readers, starting after the unlock() we just did, already got
380 // the new indexReader we set above. So nobody can be possibly
381 // using the old indexReader, and we can close it:
// NOTE(review): elided — presumably "oldreader.close();".
384 // We prefetch some of the arrays to make requests much faster.
385 // Let's refresh these prefetched arrays; This refresh
386 // is made more efficient by assuming that it is enough to read
387 // the values for new categories (old categories could not have been
388 // changed or deleted)
389 // Note that this is done without the write lock being held,
390 // which means that it is possible that during a refresh(), a
391 // reader will have some methods (like getOrdinal and getCategory)
392 // return fresh information, while getParent()
393 // (only to be prefetched now) still return older information.
394 // We consider this to be acceptable. The important thing,
395 // however, is that refreshPrefetchArrays() itself writes to
396 // the arrays in a correct manner (see discussion there)
397 parentArray.refresh(indexReader);
399 // Remove any INVALID_ORDINAL values from the ordinal cache,
400 // because it is possible those are now answered by the new data!
// NOTE(review): this iteration is not under the getOrdinalCache monitor in
// the visible lines, although LRUHashMap is documented above as not
// thread-safe — confirm synchronization against the original source.
401 Iterator<Entry<String, Integer>> i = getOrdinalCache.entrySet().iterator();
402 while (i.hasNext()) {
403 Entry<String, Integer> e = i.next();
404 if (e.getValue().intValue() == INVALID_ORDINAL) {
// NOTE(review): elided — presumably "i.remove();".
// Closes the underlying index reader, and also the Directory if (and only
// if) this instance opened it itself via a File/String constructor.
411 public void close() throws IOException {
// NOTE(review): elided — presumably "indexReader.close();" precedes this.
413 if (ourDirectory!=null) {
414 ourDirectory.close();
// Returns the number of categories in the taxonomy (one document per
// category), reading numDocs() under the read lock so it cannot race
// with a refresh() swap of indexReader.
418 public int getSize() {
419 indexReaderLock.readLock().lock();
// NOTE(review): a try { ... } finally { unlock } wrapper appears elided
// here; as written the unlock below would be unreachable after the return.
421 return indexReader.numDocs();
423 indexReaderLock.readLock().unlock();
// Exposes the commit user-data map stored by the taxonomy writer in the
// underlying index's latest commit.
427 public Map<String, String> getCommitUserData() {
428 return indexReader.getCommitUserData();
// Lazily-built youngest-child / older-sibling arrays; rebuilt (enlarged)
// on demand when the taxonomy has grown. Guarded by childrenArraysRebuild.
431 private ChildrenArrays childrenArrays;
432 Object childrenArraysRebuild = new Object();
434 public ChildrenArrays getChildrenArrays() {
435 // Check if the taxonomy grew since we built the array, and if it
436 // did, create new (and larger) arrays and fill them as required.
437 // We do all this under a lock, to prevent two concurrent calls from
438 // needlessly doing the same array building at the same time.
439 synchronized(childrenArraysRebuild) {
// NOTE(review): elided — presumably declarations such as "int num = getSize();"
// and "int first;" here; confirm against the original source.
442 if (childrenArrays==null) {
// NOTE(review): elided — presumably "first = 0;" on this branch, with the
// else-branch below reading the previous array's length:
445 first = childrenArrays.getYoungestChildArray().length;
447 // If the taxonomy hasn't grown, we can return the existing object
// NOTE(review): elided — presumably "if (first == num)" guards this return.
450 return childrenArrays;
452 // Otherwise, build new arrays for a new ChildrenArray object.
453 // These arrays start with an enlarged copy of the previous arrays,
454 // and then are modified to take into account the new categories:
455 int[] newYoungestChildArray = new int[num];
456 int[] newOlderSiblingArray = new int[num];
457 // In Java 6, we could just do Arrays.copyOf()...
458 if (childrenArrays!=null) {
459 System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
460 newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
461 System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
462 newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
// New slots start out with no children until the linking loop below.
464 int[] parents = getParentArray();
465 for (int i=first; i<num; i++) {
466 newYoungestChildArray[i] = INVALID_ORDINAL;
468 // In the loop below we can ignore the root category (0) because
// NOTE(review): remainder of this comment and the loop-start adjustment
// appear elided in this extract.
472 newOlderSiblingArray[0] = INVALID_ORDINAL;
474 for (int i=first; i<num; i++) {
475 // Note that parents[i] is always < i, so the right-hand-side of
476 // the following line is already set when we get here.
// Prepend i to its parent's child list: i's older sibling becomes the
// parent's previous youngest child, then i becomes the youngest.
477 newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
478 newYoungestChildArray[parents[i]] = i;
480 // Finally switch to the new arrays
481 childrenArrays = new ChildrenArraysImpl(newYoungestChildArray,
482 newOlderSiblingArray);
483 return childrenArrays;
// Debug helper: dumps up to max categories, one "ordinal: path" line each.
// I/O errors while fetching a path are logged at FINEST and stop the dump.
487 public String toString(int max) {
488 StringBuilder sb = new StringBuilder();
489 int upperl = Math.min(max, this.indexReader.maxDoc());
490 for (int i = 0; i < upperl; i++) {
// NOTE(review): elided — the "try {" matching the catch clause below.
492 CategoryPath category = this.getPath(i);
493 if (category == null) {
494 sb.append(i + ": NULL!! \n");
// NOTE(review): elided — presumably "continue;".
497 if (category.length() == 0) {
498 sb.append(i + ": EMPTY STRING!! \n");
// NOTE(review): elided — presumably "continue;".
501 sb.append(i +": "+category.toString()+"\n");
502 } catch (IOException e) {
503 if (logger.isLoggable(Level.FINEST)) {
504 logger.log(Level.FINEST, e.getMessage(), e);
508 return sb.toString();
// Immutable-by-convention holder pairing the two prefetched child-link
// arrays handed out by getChildrenArrays(); callers must not modify them.
511 private static final class ChildrenArraysImpl implements ChildrenArrays {
512 private int[] youngestChildArray, olderSiblingArray;
513 public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
514 this.youngestChildArray = youngestChildArray;
515 this.olderSiblingArray = olderSiblingArray;
517 public int[] getOlderSiblingArray() {
518 return olderSiblingArray;
520 public int[] getYoungestChildArray() {
521 return youngestChildArray;
526 * Expert: This method is only for expert use.
527 * Note also that any call to refresh() will invalidate the returned reader,
528 * so the caller needs to take care of appropriate locking.
530 * @return lucene indexReader
532 IndexReader getInternalIndexReader() {
533 return this.indexReader;
537 * Expert: decreases the refCount of this TaxonomyReader instance.
538 * If the refCount drops to 0, then pending changes (if any) are
539 * committed to the taxonomy index and this reader is closed.
540 * @throws IOException if releasing the underlying reader fails.
542 public void decRef() throws IOException {
// Delegates reference counting entirely to the wrapped IndexReader.
543 this.indexReader.decRef();
547 * Expert: returns the current refCount for this taxonomy reader.
* @return the wrapped IndexReader's reference count.
549 public int getRefCount() {
550 return this.indexReader.getRefCount();
554 * Expert: increments the refCount of this TaxonomyReader instance.
555 * RefCounts are used to determine when a taxonomy reader can be closed
556 * safely, i.e. as soon as there are no more references.
557 * Be sure to always call a corresponding decRef(), in a finally clause;
558 * otherwise the reader may never be closed.
560 public void incRef() {
// Delegates reference counting entirely to the wrapped IndexReader.
561 this.indexReader.incRef();