1 package org.apache.lucene.facet.taxonomy.directory;
3 import java.io.BufferedInputStream;
4 import java.io.BufferedOutputStream;
5 import java.io.DataInputStream;
6 import java.io.DataOutputStream;
8 import java.io.FileInputStream;
9 import java.io.FileNotFoundException;
10 import java.io.FileOutputStream;
11 import java.io.IOException;
12 import java.util.HashMap;
15 import org.apache.lucene.analysis.KeywordAnalyzer;
16 import org.apache.lucene.analysis.TokenStream;
17 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
18 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
19 import org.apache.lucene.document.Document;
20 import org.apache.lucene.document.Field;
21 import org.apache.lucene.document.Field.Index;
22 import org.apache.lucene.document.Field.Store;
23 import org.apache.lucene.index.CorruptIndexException;
24 import org.apache.lucene.index.FieldInfo.IndexOptions;
25 import org.apache.lucene.index.IndexReader;
26 import org.apache.lucene.index.IndexWriter;
27 import org.apache.lucene.index.IndexWriterConfig;
28 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
29 import org.apache.lucene.index.LogByteSizeMergePolicy;
30 import org.apache.lucene.index.Term;
31 import org.apache.lucene.index.TermDocs;
32 import org.apache.lucene.index.TermEnum;
33 import org.apache.lucene.store.AlreadyClosedException;
34 import org.apache.lucene.store.Directory;
35 import org.apache.lucene.store.LockObtainFailedException;
36 import org.apache.lucene.store.NativeFSLockFactory;
37 import org.apache.lucene.store.SimpleFSLockFactory;
38 import org.apache.lucene.util.Version;
40 import org.apache.lucene.facet.taxonomy.CategoryPath;
41 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
42 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
43 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
44 import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
45 import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
48 * Licensed to the Apache Software Foundation (ASF) under one or more
49 * contributor license agreements. See the NOTICE file distributed with
50 * this work for additional information regarding copyright ownership.
51 * The ASF licenses this file to You under the Apache License, Version 2.0
52 * (the "License"); you may not use this file except in compliance with
53 * the License. You may obtain a copy of the License at
55 * http://www.apache.org/licenses/LICENSE-2.0
57 * Unless required by applicable law or agreed to in writing, software
58 * distributed under the License is distributed on an "AS IS" BASIS,
59 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
60 * See the License for the specific language governing permissions and
61 * limitations under the License.
65 * {@link TaxonomyWriter} which uses a {@link Directory} to store the taxonomy
66 * information on disk, and keeps an additional in-memory cache of some or all
69 * In addition to the permanently-stored information in the {@link Directory},
70 * efficiency dictates that we also keep an in-memory cache of <B>recently
71 * seen</B> or <B>all</B> categories, so that we do not need to go back to disk
72 * for every category addition to see which ordinal this category already has,
73 * if any. A {@link TaxonomyWriterCache} object determines the specific caching
76 * This class offers some hooks for extending classes to control the
77 * {@link IndexWriter} instance that is used. See {@link #openIndexWriter}.
79 * @lucene.experimental
81 public class DirectoryTaxonomyWriter implements TaxonomyWriter {
84 * Property name of user commit data that contains the creation time of a taxonomy index.
86 * Applications making use of {@link TaxonomyWriter#commit(Map)} should not use this
87 * particular property name.
// Commit-data key under which the taxonomy creation time is recorded; reserved for this class.
89 public static final String INDEX_CREATE_TIME = "index.create.time";
// Underlying writer; set to null on close (ensureOpen() checks this).
91 private IndexWriter indexWriter;
// Character separating path components in the stored full-path term.
93 private char delimiter = Consts.DEFAULT_DELIMITER;
// Reusable single-token stream used to encode the parent ordinal as a position increment.
94 private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
95 private Field parentStreamField;
96 private Field fullPathField;
// In-memory category->ordinal cache; policy is pluggable (see constructors).
98 private TaxonomyWriterCache cache;
100 * We call the cache "complete" if we know that every category in our
101 * taxonomy is in the cache. When the cache is <B>not</B> complete, and
102 * we can't find a category in the cache, we still need to look for it
103 * in the on-disk index; Therefore when the cache is not complete, we
104 * need to open a "reader" to the taxonomy index.
105 * The cache becomes incomplete if it was never filled with the existing
106 * categories, or if a put() to the cache ever returned true (meaning
107 * that some of the cached data was cleared).
109 private boolean cacheIsComplete;
// Lazily-opened reader over the taxonomy index; only needed while the cache is incomplete.
110 private IndexReader reader;
// Counter of cache misses, compared against cacheMissesUntilFill in perhapsFillCache().
111 private int cacheMisses;
114 * When a taxonomy is created, we mark that its create time should be committed in the
// Non-null only until the creation time has been committed (see commit()/prepareCommit()).
117 private String taxoIndexCreateTime = null;
120 * setDelimiter changes the character that the taxonomy uses in its internal
121 * storage as a delimiter between category components. Do not use this
122 * method unless you really know what you are doing. It has nothing to do
123 * with whatever character the application may be using to represent
124 * categories for its own use.
126 * If you do use this method, make sure you call it before any other methods
127 that actually query the taxonomy. Moreover, make sure you always pass
128 * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader
129 * objects you create for the same directory.
// Overrides the internal component delimiter. Per the javadoc above, call this
// before any other method that queries or writes the taxonomy.
131 public void setDelimiter(char delimiter) {
133 this.delimiter = delimiter;
137 * Forcibly unlocks the taxonomy in the named directory.
139 * Caution: this should only be used by failure recovery code, when it is
140 * known that no other process nor thread is in fact currently accessing
143 * This method is unnecessary if your {@link Directory} uses a
144 * {@link NativeFSLockFactory} instead of the default
145 * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock
146 * does not stay behind forever when the process using it dies.
// Failure-recovery helper: delegates to IndexWriter.unlock() to forcibly release
// the directory's write lock. Only safe when no other writer is active (see javadoc).
148 public static void unlock(Directory directory) throws IOException {
149 IndexWriter.unlock(directory);
153 * Construct a Taxonomy writer.
156 * The {@link Directory} in which to store the taxonomy. Note that
157 * the taxonomy is written directly to that directory (not to a
158 * subdirectory of it).
160 * Specifies how to open a taxonomy for writing: <code>APPEND</code>
161 * means open an existing index for append (failing if the index does
162 * not yet exist). <code>CREATE</code> means create a new index (first
163 * deleting the old one if it already existed).
164 * <code>APPEND_OR_CREATE</code> appends to an existing index if there
165 * is one, otherwise it creates a new index.
167 * A {@link TaxonomyWriterCache} implementation which determines
168 * the in-memory caching policy. See for example
169 * {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
170 * If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
171 * @throws CorruptIndexException
172 * if the taxonomy is corrupted.
173 * @throws LockObtainFailedException
174 * if the taxonomy is locked by another writer. If it is known
175 * that no other concurrent writer is active, the lock might
176 * have been left around by an old dead process, and should be
177 * removed using {@link #unlock(Directory)}.
178 * @throws IOException
179 * if another error occurred.
// Main constructor: opens/creates the taxonomy index in the given directory with
// the given open mode and cache policy (null cache -> defaultTaxonomyWriterCache()).
181 public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode,
182 TaxonomyWriterCache cache)
183 throws CorruptIndexException, LockObtainFailedException,
// Record a creation timestamp for brand-new indexes; it is committed later
// together with user commit data (see combinedCommitData()).
186 if (!IndexReader.indexExists(directory) || openMode==OpenMode.CREATE) {
187 taxoIndexCreateTime = Long.toString(System.nanoTime());
190 indexWriter = openIndexWriter(directory, openMode);
// The payload field carries the parent ordinal via parentStream; norms are
// unnecessary for it, and the full-path field only needs doc IDs (no freqs/positions).
193 parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
194 parentStreamField.setOmitNorms(true);
195 fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS);
196 fullPathField.setIndexOptions(IndexOptions.DOCS_ONLY);
// Ordinals are doc IDs, so the next free ordinal is the current doc count.
198 this.nextID = indexWriter.maxDoc();
201 cache = defaultTaxonomyWriterCache();
// Empty index: the (empty) cache trivially covers it, and we seed the root category.
206 cacheIsComplete = true;
207 // Make sure that the taxonomy always contain the root category
208 // with category id 0.
209 addCategory(new CategoryPath());
212 // There are some categories on the disk, which we have not yet
213 // read into the cache, and therefore the cache is incomplete.
214 // We chose not to read all the categories into the cache now,
215 // to avoid terrible performance when a taxonomy index is opened
216 // to add just a single category. We will do it later, after we
217 // notice a few cache misses.
218 cacheIsComplete = false;
224 * A hook for extensions of this class to provide their own
225 * {@link IndexWriter} implementation or instance. Extending classes can
226 * instantiate and configure the {@link IndexWriter} as they see fit,
227 * including setting a {@link org.apache.lucene.index.MergeScheduler}, or
228 * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size
230 * <b>NOTE:</b> the instance this method returns will be closed upon calling
231 * to {@link #close()}.
234 * the {@link Directory} on top of which an {@link IndexWriter}
237 * see {@link OpenMode}
// Extension hook: builds the IndexWriter used for the taxonomy. The returned
// writer is owned by this class and closed in close().
239 protected IndexWriter openIndexWriter(Directory directory, OpenMode openMode)
241 // Make sure we use a MergePolicy which merges segments in-order and thus
242 // keeps the doc IDs ordered as well (this is crucial for the taxonomy
// KeywordAnalyzer is effectively unused here: the fields added are untokenized
// or carry their own TokenStream (see addCategoryDocument()).
244 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
245 new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy(
246 new LogByteSizeMergePolicy());
247 return new IndexWriter(directory, config);
250 // Currently overridden by a unit test that verifies that every index we open
253 * Open an {@link IndexReader} from the {@link #indexWriter} member, by
254 * calling {@link IndexWriter#getReader()}. Extending classes can override
255 * this method to return their own {@link IndexReader}.
// Extension hook: opens a near-real-time reader over indexWriter (read-only),
// so uncommitted category additions are visible to lookups.
257 protected IndexReader openReader() throws IOException {
258 return IndexReader.open(indexWriter, true);
262 * Creates a new instance with a default cached as defined by
263 * {@link #defaultTaxonomyWriterCache()}.
// Convenience constructor: delegates to the 3-arg constructor with the default cache.
265 public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode)
266 throws CorruptIndexException, LockObtainFailedException, IOException {
267 this(directory, openMode, defaultTaxonomyWriterCache());
271 * Defines the default {@link TaxonomyWriterCache} to use in constructors
272 * which do not specify one.
274 * The current default is {@link Cl2oTaxonomyWriterCache} constructed
275 * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
276 * cached in memory while building it.
// Factory for the default cache policy: a Cl2oTaxonomyWriterCache sized to hold
// the whole taxonomy in memory while building (parameters per the javadoc above).
278 public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
279 return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
282 // convenience constructors:
// Convenience constructor: CREATE_OR_APPEND mode with the default cache.
284 public DirectoryTaxonomyWriter(Directory d)
285 throws CorruptIndexException, LockObtainFailedException,
287 this(d, OpenMode.CREATE_OR_APPEND);
291 * Frees used resources as well as closes the underlying {@link IndexWriter},
292 * which commits whatever changes made to it to the underlying
// Commits any pending creation-time metadata and closes the writer.
// Idempotent with respect to indexWriter == null (already closed).
295 public synchronized void close() throws CorruptIndexException, IOException {
296 if (indexWriter != null) {
// Flush the creation time exactly once, then clear the marker.
297 if (taxoIndexCreateTime != null) {
298 indexWriter.commit(combinedCommitData(null));
299 taxoIndexCreateTime = null;
// Internal close path; body not visible in this view — presumably closes
// indexWriter and calls closeResources(). TODO(review): confirm against full source.
305 private void doClose() throws CorruptIndexException, IOException {
312 * Returns the number of memory bytes used by the cache.
313 * @return Number of cache bytes in memory, for CL2O only; zero otherwise.
// Reports cache memory usage; only Cl2oTaxonomyWriterCache can report this,
// so any other (or null) cache yields the early-return path.
315 public int getCacheMemoryUsage() {
317 if (this.cache == null || !(this.cache instanceof Cl2oTaxonomyWriterCache)) {
320 return ((Cl2oTaxonomyWriterCache)this.cache).getMemoryUsage();
324 * A hook for extending classes to close additional resources that were used.
325 * The default implementation closes the {@link IndexReader} as well as the
326 * {@link TaxonomyWriterCache} instances that were used. <br>
327 * <b>NOTE:</b> if you override this method, you should include a
328 * <code>super.closeResources()</code> call in your implementation.
// Extension hook for releasing auxiliary resources (reader, cache — see javadoc).
// Overriders must call super.closeResources().
330 protected synchronized void closeResources() throws IOException {
331 if (reader != null) {
342 * Look up the given category in the cache and/or the on-disk storage,
343 * returning the category's ordinal, or a negative number in case the
344 * category does not yet exist in the taxonomy.
// Resolves a category to its ordinal via cache first, then (if the cache is
// incomplete) the on-disk index; a negative result means "not in the taxonomy".
346 protected int findCategory(CategoryPath categoryPath) throws IOException {
347 // If we can find the category in our cache, we can return the
348 // response directly from it:
349 int res = cache.get(categoryPath);
353 // If we know that the cache is complete, i.e., contains every category
354 // which exists, we can return -1 immediately. However, if the cache is
355 // not complete, we need to check the disk.
356 if (cacheIsComplete) {
360 // After a few cache misses, it makes sense to read all the categories
361 // from disk and into the cache. The reason not to do this on the first
362 // cache miss (or even when opening the writer) is that it will
363 // significantly slow down the case when a taxonomy is opened just to
364 // add one category. The idea of only spending a long time on reading
365 // after enough time was spent on cache misses is known as an "online
367 if (perhapsFillCache()) {
368 return cache.get(categoryPath);
371 // We need to get an answer from the on-disk index. If a reader
372 // is not yet open, do it now:
373 if (reader == null) {
374 reader = openReader();
// Look up the exact full-path term; its single posting is the category's ordinal.
377 TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
378 .toString(delimiter)));
380 return -1; // category does not exist in taxonomy
382 // Note: we do NOT add to the cache the fact that the category
383 // does not exist. The reason is that our only use for this
384 // method is just before we actually add this category. If
385 // in the future this usage changes, we should consider caching
386 // the fact that the category is not in the taxonomy.
387 addToCache(categoryPath, docs.doc());
392 * Look up the given prefix of the given category in the cache and/or the
393 * on-disk storage, returning that prefix's ordinal, or a negative number in
394 * case the category does not yet exist in the taxonomy.
// Prefix variant of findCategory(): resolves only the first prefixLen components.
// Same cache-then-disk strategy as the full-path version above.
396 private int findCategory(CategoryPath categoryPath, int prefixLen)
398 int res = cache.get(categoryPath, prefixLen);
402 if (cacheIsComplete) {
406 if (perhapsFillCache()) {
407 return cache.get(categoryPath, prefixLen);
409 if (reader == null) {
410 reader = openReader();
// Exact-term lookup on the prefix's serialized path.
412 TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
413 .toString(delimiter, prefixLen)));
415 return -1; // category does not exist in taxonomy
417 addToCache(categoryPath, prefixLen, docs.doc());
421 // TODO (Facet): addCategory() is synchronized. This means that if indexing is
422 // multi-threaded, a new category that needs to be written to disk (and
423 // potentially even trigger a lengthy merge) locks out other addCategory()
424 // calls - even those which could immediately return a cached value.
425 // We definitely need to fix this situation!
// Returns the category's existing ordinal, or inserts it (with any missing
// ancestors) and returns the new ordinal. Fully synchronized — see TODO above.
426 public synchronized int addCategory(CategoryPath categoryPath) throws IOException {
428 // If the category is already in the cache and/or the taxonomy, we
429 // should return its existing ordinal:
430 int res = findCategory(categoryPath);
432 // This is a new category, and we need to insert it into the index
433 // (and the cache). Actually, we might also need to add some of
434 // the category's ancestors before we can add the category itself
435 // (while keeping the invariant that a parent is always added to
436 // the taxonomy before its child). internalAddCategory() does all
438 res = internalAddCategory(categoryPath, categoryPath.length());
445 * Add a new category into the index (and the cache), and return its new
448 * Actually, we might also need to add some of the category's ancestors
449 * before we can add the category itself (while keeping the invariant that a
450 * parent is always added to the taxonomy before its child). We do this by
// Recursively ensures the parent prefix exists, then writes the category document.
// `length` is the number of path components to add (callers pass path.length()).
453 private int internalAddCategory(CategoryPath categoryPath, int length)
454 throws CorruptIndexException, IOException {
456 // Find our parent's ordinal (recursively adding the parent category
457 // to the taxonomy if it's not already there). Then add the parent
458 // ordinal as payloads (rather than a stored field; payloads can be
459 // more efficiently read into memory in bulk by LuceneTaxonomyReader)
462 parent = findCategory(categoryPath, length - 1);
464 parent = internalAddCategory(categoryPath, length - 1);
// length==1: the parent is the root; length==0 (the root itself) has no parent.
466 } else if (length == 1) {
467 parent = TaxonomyReader.ROOT_ORDINAL;
469 parent = TaxonomyReader.INVALID_ORDINAL;
471 int id = addCategoryDocument(categoryPath, length, parent);
477 * Verifies that this instance wasn't closed, or throws
478 * {@link AlreadyClosedException} if it is.
// Guard used by public methods: indexWriter is nulled on close, so a null
// writer means this instance has been closed.
480 protected final void ensureOpen() {
481 if (indexWriter == null) {
482 throw new AlreadyClosedException("The taxonomy writer has already been closed");
486 // Note that the methods calling addCategoryDocument() are synchornized,
487 // so this method is effectively synchronized as well, but we'll add
488 // synchronized to be on the safe side, and we can reuse class-local objects
489 // instead of allocating them every time
// Writes one category as a Lucene document: parent ordinal encoded in the
// payload field's position increment, full path as an untokenized stored field.
// Returns the new category's ordinal (its doc ID).
490 protected synchronized int addCategoryDocument(CategoryPath categoryPath,
491 int length, int parent)
492 throws CorruptIndexException, IOException {
493 // Before Lucene 2.9, position increments >=0 were supported, so we
494 // added 1 to parent to allow the parent -1 (the parent of the root).
495 // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
496 // no longer enough, since 0 is not encoded consistently either (see
497 // comment in SinglePositionTokenStream). But because we must be
498 // backward-compatible with existing indexes, we can't just fix what
499 // we write here (e.g., to write parent+2), and need to do a workaround
500 // in the reader (which knows that anyway only category 0 has a parent
502 parentStream.set(parent+1);
503 Document d = new Document();
504 d.add(parentStreamField);
// Reuse the class-level field instance (this method is synchronized).
506 fullPathField.setValue(categoryPath.toString(delimiter, length));
507 d.add(fullPathField);
509 // Note that we do not pass an Analyzer here because the fields that are
510 // added to the Document are untokenized or contains their own TokenStream.
511 // Therefore the IndexWriter's Analyzer has no effect.
512 indexWriter.addDocument(d);
515 addToCache(categoryPath, length, id);
517 // also add to the parent array
518 getParentArray().add(id, parent);
// A TokenStream emitting exactly one token per use; the interesting datum is
// the position increment, which carries the parent ordinal (+1) to the index.
523 private static class SinglePositionTokenStream extends TokenStream {
524 private CharTermAttribute termAtt;
525 private PositionIncrementAttribute posIncrAtt;
// Tracks whether the single token was already emitted in incrementToken().
526 private boolean returned;
527 public SinglePositionTokenStream(String word) {
528 termAtt = addAttribute(CharTermAttribute.class);
529 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
530 termAtt.setEmpty().append(word);
534 * Set the value we want to keep, as the position increment.
535 * Note that when TermPositions.nextPosition() is later used to
536 * retrieve this value, val-1 will be returned, not val.
538 * IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
539 * the retrieved position would be -1). But starting with Lucene 2.9,
540 * this unfortunately changed, and only val>0 are safe. val=0 can
541 * still be used, but don't count on the value you retrieve later
542 * (it could be 0 or -1, depending on circumstances or versions).
543 * This change is described in Lucene's JIRA: LUCENE-1542.
545 public void set(int val) {
546 posIncrAtt.setPositionIncrement(val);
550 public boolean incrementToken() throws IOException {
// Inserts a mapping into the cache; if the cache evicted entries to make room
// (put() returned true), mark it incomplete so future misses consult the index.
559 private void addToCache(CategoryPath categoryPath, int id)
560 throws CorruptIndexException, IOException {
561 if (cache.put(categoryPath, id)) {
562 // If cache.put() returned true, it means the cache was limited in
563 // size, became full, so parts of it had to be cleared.
564 // Unfortunately we don't know which part was cleared - it is
565 // possible that a relatively-new category that hasn't yet been
566 // committed to disk (and therefore isn't yet visible in our
567 // "reader") was deleted from the cache, and therefore we must
568 // now refresh the reader.
569 // Because this is a slow operation, cache implementations are
570 // expected not to delete entries one-by-one but rather in bulk
571 // (LruTaxonomyWriterCache removes the 2/3rd oldest entries).
573 cacheIsComplete = false;
// Prefix variant of addToCache(); same eviction handling as above.
577 private void addToCache(CategoryPath categoryPath, int prefixLen, int id)
578 throws CorruptIndexException, IOException {
579 if (cache.put(categoryPath, prefixLen, id)) {
581 cacheIsComplete = false;
// Reopens the internal reader if the index changed, so recently added (and
// cache-evicted) categories become visible to disk lookups.
585 private synchronized void refreshReader() throws IOException {
586 if (reader != null) {
587 IndexReader r2 = IndexReader.openIfChanged(reader);
596 * Calling commit() ensures that all the categories written so far are
597 * visible to a reader that is opened (or reopened) after that call.
598 * When the index is closed(), commit() is also implicitly done.
599 * See {@link TaxonomyWriter#commit()}
// Commits pending changes; on a freshly created taxonomy the first commit also
// persists the creation-time marker via combinedCommitData().
601 public synchronized void commit() throws CorruptIndexException, IOException {
603 if (taxoIndexCreateTime != null) {
604 indexWriter.commit(combinedCommitData(null));
605 taxoIndexCreateTime = null;
607 indexWriter.commit();
613 * Combine original user data with that of the taxonomy creation time
// Merges user commit data (may be null) with the INDEX_CREATE_TIME property.
615 private Map<String,String> combinedCommitData(Map<String,String> userData) {
616 Map<String,String> m = new HashMap<String, String>();
617 if (userData != null) {
620 m.put(INDEX_CREATE_TIME, taxoIndexCreateTime);
625 * Like commit(), but also store properties with the index. These properties
626 * are retrievable by {@link DirectoryTaxonomyReader#getCommitUserData}.
627 * See {@link TaxonomyWriter#commit(Map)}.
// Like commit(), but also stores caller-supplied commit user data; the
// creation-time marker is merged in on the first commit only.
629 public synchronized void commit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
631 if (taxoIndexCreateTime != null) {
632 indexWriter.commit(combinedCommitData(commitUserData));
633 taxoIndexCreateTime = null;
635 indexWriter.commit(commitUserData);
641 * prepare most of the work needed for a two-phase commit.
642 * See {@link IndexWriter#prepareCommit}.
// First phase of a two-phase commit; mirrors commit()'s creation-time handling.
644 public synchronized void prepareCommit() throws CorruptIndexException, IOException {
646 if (taxoIndexCreateTime != null) {
647 indexWriter.prepareCommit(combinedCommitData(null));
648 taxoIndexCreateTime = null;
650 indexWriter.prepareCommit();
655 * Like above, and also prepares to store user data with the index.
656 * See {@link IndexWriter#prepareCommit(Map)}
// prepareCommit() variant that also stages user commit data.
658 public synchronized void prepareCommit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
660 if (taxoIndexCreateTime != null) {
661 indexWriter.prepareCommit(combinedCommitData(commitUserData));
662 taxoIndexCreateTime = null;
664 indexWriter.prepareCommit(commitUserData);
669 * getSize() returns the number of categories in the taxonomy.
671 * Because categories are numbered consecutively starting with 0, it means
672 * the taxonomy contains ordinals 0 through getSize()-1.
674 * Note that the number returned by getSize() is often slightly higher than
675 * the number of categories inserted into the taxonomy; This is because when
676 * a category is added to the taxonomy, its ancestors are also added
677 * automatically (including the root, which always get ordinal 0).
// Taxonomy size == doc count, since each category is exactly one document
// and categories are never deleted.
679 synchronized public int getSize() {
681 return indexWriter.maxDoc();
// Guards perhapsFillCache() against repeated whole-taxonomy read attempts.
684 private boolean alreadyCalledFillCache = false;
687 * Set the number of cache misses before an attempt is made to read the
688 * entire taxonomy into the in-memory cache.
690 * LuceneTaxonomyWriter holds an in-memory cache of recently seen
691 * categories to speed up operation. On each cache-miss, the on-disk index
692 * needs to be consulted. When an existing taxonomy is opened, a lot of
693 * slow disk reads like that are needed until the cache is filled, so it
694 * is more efficient to read the entire taxonomy into memory at once.
695 * We do this complete read after a certain number (defined by this method)
698 * If the number is set to <CODE>0</CODE>, the entire taxonomy is read
699 * into the cache on first use, without fetching individual categories
702 * Note that if the memory cache of choice is limited in size, and cannot
703 * hold the entire content of the on-disk taxonomy, then it is never
704 * read in its entirety into the cache, regardless of the setting of this
// Tunes how many cache misses are tolerated before the whole taxonomy is read
// into the cache (0 = fill on first use; see javadoc above). Default is 11.
707 public void setCacheMissesUntilFill(int i) {
709 cacheMissesUntilFill = i;
712 private int cacheMissesUntilFill = 11;
// After enough cache misses, tries to load every category from disk into the
// cache. Returns whether the cache is now complete (so callers can answer
// lookups from the cache alone).
714 private boolean perhapsFillCache() throws IOException {
715 // Note: we assume that we're only called when cacheIsComplete==false.
716 // TODO (Facet): parametrize this criterion:
717 if (cacheMisses < cacheMissesUntilFill) {
720 // If the cache was already filled (or we decided not to fill it because
721 // there was no room), there is no sense in trying it again.
722 if (alreadyCalledFillCache) {
725 alreadyCalledFillCache = true;
726 // TODO (Facet): we should probably completely clear the cache before starting
728 if (reader == null) {
729 reader = openReader();
// Only attempt the fill if the cache can hold every category.
732 if (!cache.hasRoom(reader.numDocs())) {
736 CategoryPath cp = new CategoryPath();
737 TermDocs td = reader.termDocs();
738 Term fullPathTerm = new Term(Consts.FULL);
739 String field = fullPathTerm.field(); // needed so we can later use !=
740 TermEnum terms = reader.terms(fullPathTerm);
741 // The check is done here to avoid checking it on every iteration of the
742 // below loop. A null term will be returned if there are no terms in the
743 // lexicon, or after the Consts.FULL term. However while the loop is
744 // executed we're safe, because we only iterate as long as there are next()
746 if (terms.term() != null) {
748 Term t = terms.term();
// Interned-field identity comparison (see the note on line 739 above).
749 if (t.field() != field) break;
750 // Since we guarantee uniqueness of categories, each term has exactly
751 // one document. Also, since we do not allow removing categories (and
752 // hence documents), there are no deletions in the index. Therefore, it
753 // is sufficient to call next(), and then doc(), exactly once with no
754 // 'validation' checks.
758 cp.add(t.text(), delimiter);
759 cache.put(cp, td.doc());
760 } while (terms.next());
763 cacheIsComplete = true;
764 // No sense to keep the reader open - we will not need to read from it
765 // if everything is in the cache.
// Lazily-built in-memory array mapping ordinal -> parent ordinal.
771 private ParentArray parentArray;
// Builds the parent array on first use by scanning the index via the reader.
772 private synchronized ParentArray getParentArray() throws IOException {
773 if (parentArray==null) {
774 if (reader == null) {
775 reader = openReader();
777 parentArray = new ParentArray();
778 parentArray.refresh(reader);
// Returns the parent ordinal of the given category ordinal, bounds-checked
// against the logical taxonomy size (not the array's allocated length).
782 public int getParent(int ordinal) throws IOException {
784 // Note: the following if() just enforces that a user can never ask
785 // for the parent of a nonexistent category - even if the parent array
786 // was allocated bigger than it really needs to be.
787 if (ordinal >= getSize()) {
788 throw new ArrayIndexOutOfBoundsException();
790 return getParentArray().getArray()[ordinal];
794 * Take all the categories of one or more given taxonomies, and add them to
795 * the main taxonomy (this), if they are not already there.
797 * Additionally, fill a <I>mapping</I> for each of the added taxonomies,
798 * mapping its ordinals to the ordinals in the enlarged main taxonomy.
799 * These mapping are saved into an array of OrdinalMap objects given by the
800 * user, one for each of the given taxonomies (not including "this", the main
801 * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
802 * others will be a DiskOrdinalMap - see discussion in {OrdinalMap}.
804 * Note that the taxonomies to be added are given as Directory objects,
805 * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
806 * currently managed by an open TaxonomyWriter, make sure to commit() (or
807 * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
808 * and does not need to be commit()ed before this call.
// Merges the categories of the given taxonomies into this one, filling one
// OrdinalMap per input taxonomy (old ordinal -> ordinal in the merged taxonomy).
// Implemented as an n-way merge over the lexicographically-ordered full-path terms.
810 public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
812 // To prevent us stepping on the rest of this class's decisions on when
813 // to open a reader, and when not, we'll be opening a new reader instead
814 // of using the existing "reader" object:
815 IndexReader mainreader = openReader();
816 TermEnum mainte = mainreader.terms(new Term(Consts.FULL));
818 IndexReader[] otherreaders = new IndexReader[taxonomies.length];
819 TermEnum[] othertes = new TermEnum[taxonomies.length];
820 for (int i=0; i<taxonomies.length; i++) {
821 otherreaders[i] = IndexReader.open(taxonomies[i]);
822 othertes[i] = otherreaders[i].terms(new Term(Consts.FULL));
823 // Also tell the ordinal maps their expected sizes:
824 ordinalMaps[i].setSize(otherreaders[i].numDocs());
827 CategoryPath cp = new CategoryPath();
829 // We keep a "current" cursor over the alphabetically-ordered list of
830 // categories in each taxonomy. We start the cursor on the first
831 // (alphabetically) category of each taxonomy:
834 String[] currentOthers = new String[taxonomies.length];
835 currentMain = nextTE(mainte);
836 int otherTaxonomiesLeft = 0;
837 for (int i=0; i<taxonomies.length; i++) {
838 currentOthers[i] = nextTE(othertes[i]);
839 if (currentOthers[i]!=null) {
840 otherTaxonomiesLeft++;
844 // And then, at each step look at the first (alphabetically) of the
845 // current taxonomies.
846 // NOTE: The most efficient way we could have done this is using a
847 // PriorityQueue. But for simplicity, and assuming that usually we'll
848 // have a very small number of other taxonomies (often just 1), we use
849 // a more naive algorithm (o(ntaxonomies) instead of o(ln ntaxonomies)
852 while (otherTaxonomiesLeft>0) {
// 'first' is the lexicographically smallest pending category among the others.
854 for (int i=0; i<taxonomies.length; i++) {
855 if (currentOthers[i]==null) continue;
856 if (first==null || first.compareTo(currentOthers[i])>0) {
857 first = currentOthers[i];
861 if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
862 // If 'first' is before currentMain, or currentMain is null,
863 // then 'first' is a new category and we need to add it to the
864 // main taxonomy. Then for all taxonomies with this 'first'
865 // category, we need to add the new category number to their
866 // map, and move to the next category in all of them.
868 cp.add(first, delimiter);
869 // We can call internalAddCategory() instead of addCategory()
870 // because we know the category hasn't been seen yet.
871 int newordinal = internalAddCategory(cp, cp.length());
872 // TODO (Facet): we already had this term in our hands before, in nextTE...
873 Term t = new Term(Consts.FULL, first);
874 for (int i=0; i<taxonomies.length; i++) {
875 if (first.equals(currentOthers[i])) {
876 // remember the remapping of this ordinal. Note how
877 // this requires reading a posting list from the index -
878 // but since we do this in lexical order of terms, just
879 // like Lucene's merge works, we hope there are few seeks.
880 // TODO (Facet): is there a quicker way? E.g., not specifying the
881 // next term by name every time?
882 TermDocs td = otherreaders[i].termDocs(t);
883 td.next(); // TODO (Facet): check?
884 int origordinal = td.doc();
885 ordinalMaps[i].addMapping(origordinal, newordinal);
886 // and move to the next category in the i'th taxonomy
887 currentOthers[i] = nextTE(othertes[i]);
888 if (currentOthers[i]==null) {
889 otherTaxonomiesLeft--;
893 } else if (comp==0) {
894 // 'first' and currentMain are the same, so both the main and some
895 // other taxonomies need to be moved, but a category doesn't need
896 // to be added because it already existed in the main taxonomy.
898 // TODO (Facet): Again, is there a quicker way?
899 Term t = new Term(Consts.FULL, first);
900 TermDocs td = mainreader.termDocs(t);
901 td.next(); // TODO (Facet): check?
902 int newordinal = td.doc();
904 currentMain = nextTE(mainte);
905 for (int i=0; i<taxonomies.length; i++) {
906 if (first.equals(currentOthers[i])) {
907 // TODO (Facet): again, is there a quicker way?
908 td = otherreaders[i].termDocs(t);
909 td.next(); // TODO (Facet): check?
910 int origordinal = td.doc();
911 ordinalMaps[i].addMapping(origordinal, newordinal);
913 // and move to the next category
914 currentOthers[i] = nextTE(othertes[i]);
915 if (currentOthers[i]==null) {
916 otherTaxonomiesLeft--;
920 } else /* comp > 0 */ {
921 // The currentMain doesn't appear in any of the other taxonomies -
922 // we don't need to do anything, just continue to the next one
923 currentMain = nextTE(mainte);
927 // Close all the readers we've opened, and also tell the ordinal maps
928 // we're done adding to them
930 for (int i=0; i<taxonomies.length; i++) {
931 otherreaders[i].close();
932 // We never actually added a mapping for the root ordinal - let's do
933 // it now, just so that the map is complete (every ordinal between 0
934 // and size-1 is remapped)
935 ordinalMaps[i].addMapping(0, 0);
936 ordinalMaps[i].addDone();
941 * Mapping from old ordinal to new ordinals, used when merging indexes
942 with separate taxonomies.
944 * addToTaxonomies() merges one or more taxonomies into the given taxonomy
945 * (this). An OrdinalMap is filled for each of the added taxonomies,
946 * containing the new ordinal (in the merged taxonomy) of each of the
947 * categories in the old taxonomy.
949 * There exist two implementations of OrdinalMap: MemoryOrdinalMap and
950 * DiskOrdinalMap. As their names suggest, the former keeps the map in
951 * memory and the latter in a temporary disk file. Because these maps will
952 * later be needed one by one (to remap the counting lists), not all at the
953 * same time, it is recommended to put the first taxonomy's map in memory,
954 * and all the rest on disk (later to be automatically read into memory one
955 * by one, when needed).
957 public static interface OrdinalMap {
959 * Set the size of the map. This MUST be called before addMapping().
960 * It is assumed (but not verified) that addMapping() will then be
961 * called exactly 'size' times, with different origOrdinals between 0
964 public void setSize(int size) throws IOException;
965 public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
967 * Call addDone() to say that all addMapping() have been done.
968 * In some implementations this might free some resources.
970 public void addDone() throws IOException;
972 * Return the map from the taxonomy's original (consecutive) ordinals
973 * to the new taxonomy's ordinals. If the map has to be read from disk
974 * and ordered appropriately, it is done when getMap() is called.
975 * getMap() should only be called once, and only when the map is actually
976 * needed. Calling it will also free all resources that the map might
977 * be holding (such as temporary disk space), other than the returned int[].
979 public int[] getMap() throws IOException;
983 * {@link OrdinalMap} maintained in memory
985 public static final class MemoryOrdinalMap implements OrdinalMap {
987 public void setSize(int taxonomySize) {
988 map = new int[taxonomySize];
990 public void addMapping(int origOrdinal, int newOrdinal) {
991 map[origOrdinal] = newOrdinal;
993 public void addDone() { /* nothing to do */ }
994 public int[] getMap() {
1000 * {@link OrdinalMap} maintained on file system
1002 public static final class DiskOrdinalMap implements OrdinalMap {
1004 DataOutputStream out;
1006 public DiskOrdinalMap(File tmpfile) throws FileNotFoundException {
1007 this.tmpfile = tmpfile;
1008 out = new DataOutputStream(new BufferedOutputStream(
1009 new FileOutputStream(tmpfile)));
1012 public void addMapping(int origOrdinal, int newOrdinal) throws IOException {
1013 out.writeInt(origOrdinal);
1014 out.writeInt(newOrdinal);
1017 public void setSize(int taxonomySize) throws IOException {
1018 out.writeInt(taxonomySize);
1021 public void addDone() throws IOException {
1030 public int[] getMap() throws IOException {
1034 addDone(); // in case this wasn't previously called
1035 DataInputStream in = new DataInputStream(new BufferedInputStream(
1036 new FileInputStream(tmpfile)));
1037 map = new int[in.readInt()];
1038 // NOTE: The current code assumes here that the map is complete,
1039 // i.e., every ordinal gets one and exactly one value. Otherwise,
1040 // we may run into an EOF here, or vice versa, not read everything.
1041 for (int i=0; i<map.length; i++) {
1042 int origordinal = in.readInt();
1043 int newordinal = in.readInt();
1044 map[origordinal] = newordinal;
1047 // Delete the temporary file, which is no longer needed.
1048 if (!tmpfile.delete()) {
1049 tmpfile.deleteOnExit();
1055 private static final String nextTE(TermEnum te) throws IOException {
1058 // If our enumeration reached a different field, we're done. Note
1059 // how we're allowed compare string references, rather than the
1060 // actual string's contents.
1061 if (t.field()==Consts.FULL) {
1070 * Rollback changes to the taxonomy writer and closes the instance. Following
1071 * this method the instance becomes unusable (calling any of its API methods
1072 * will yield an {@link AlreadyClosedException}).
1074 public void rollback() throws IOException {
1076 indexWriter.rollback();
1077 // since IndexWriter.rollback() closes the IW instance, we should close too.