X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java

diff --git a/lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
new file mode 100644
index 0000000..aceae06
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -0,0 +1,1081 @@
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.LogByteSizeMergePolicy;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.NativeFSLockFactory;
+import org.apache.lucene.store.SimpleFSLockFactory;
+import org.apache.lucene.util.Version;
+
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
+import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
+import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * {@link TaxonomyWriter} which uses a {@link Directory} to store the taxonomy
+ * information on disk, and keeps an additional in-memory cache of some or all
+ * categories.
+ * <p>
+ * In addition to the permanently-stored information in the {@link Directory},
+ * efficiency dictates that we also keep an in-memory cache of recently
+ * seen or all categories, so that we do not need to go back to disk
+ * for every category addition to see which ordinal this category already has,
+ * if any. A {@link TaxonomyWriterCache} object determines the specific caching
+ * algorithm used.
+ * <p>
+ * This class offers some hooks for extending classes to control the
+ * {@link IndexWriter} instance that is used. See {@link #openIndexWriter}.
+ *
+ * @lucene.experimental
+ */
+public class DirectoryTaxonomyWriter implements TaxonomyWriter {
+
+  /**
+   * Property name of user commit data that contains the creation time of a taxonomy index.
+   * <p>
+   * Applications making use of {@link TaxonomyWriter#commit(Map)} should not use this
+   * particular property name.
+   */
+  public static final String INDEX_CREATE_TIME = "index.create.time";
+
+  private IndexWriter indexWriter;
+  private int nextID;
+  private char delimiter = Consts.DEFAULT_DELIMITER;
+  private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
+  private Field parentStreamField;
+  private Field fullPathField;
+
+  private TaxonomyWriterCache cache;
+
+  /**
+   * We call the cache "complete" if we know that every category in our
+   * taxonomy is in the cache. When the cache is not complete, and
+   * we can't find a category in the cache, we still need to look for it
+   * in the on-disk index; therefore when the cache is not complete, we
+   * need to open a "reader" to the taxonomy index.
+   * The cache becomes incomplete if it was never filled with the existing
+   * categories, or if a put() to the cache ever returned true (meaning
+   * that some of the cached data was cleared).
+   */
+  private boolean cacheIsComplete;
+  private IndexReader reader;
+  private int cacheMisses;
+
+  /**
+   * When a taxonomy is created, we mark that its create time should be committed in the
+   * next commit.
+   */
+  private String taxoIndexCreateTime = null;
+
+  /**
+   * setDelimiter changes the character that the taxonomy uses in its internal
+   * storage as a delimiter between category components. Do not use this
+   * method unless you really know what you are doing. It has nothing to do
+   * with whatever character the application may be using to represent
+   * categories for its own use.
+   * <p>
+   * If you do use this method, make sure you call it before any other methods
+   * that actually query the taxonomy. Moreover, make sure you always pass
+   * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader
+   * objects you create for the same directory.
+   */
+  public void setDelimiter(char delimiter) {
+    ensureOpen();
+    this.delimiter = delimiter;
+  }
+
+  /**
+   * Forcibly unlocks the taxonomy in the named directory.
+   * <p>
+   * Caution: this should only be used by failure recovery code, when it is
+   * known that no other process or thread is in fact currently accessing
+   * this taxonomy.
+   * <p>
+   * This method is unnecessary if your {@link Directory} uses a
+   * {@link NativeFSLockFactory} instead of the default
+   * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock
+   * does not stay behind forever when the process using it dies.
+   */
+  public static void unlock(Directory directory) throws IOException {
+    IndexWriter.unlock(directory);
+  }
+
+  /**
+   * Construct a Taxonomy writer.
+   *
+   * @param directory
+   *          The {@link Directory} in which to store the taxonomy. Note that
+   *          the taxonomy is written directly to that directory (not to a
+   *          subdirectory of it).
+   * @param openMode
+   *          Specifies how to open a taxonomy for writing: APPEND
+   *          means open an existing index for append (failing if the index does
+   *          not yet exist). CREATE means create a new index (first
+   *          deleting the old one if it already existed).
+   *          APPEND_OR_CREATE appends to an existing index if there
+   *          is one, otherwise it creates a new index.
+   * @param cache
+   *          A {@link TaxonomyWriterCache} implementation which determines
+   *          the in-memory caching policy. See for example
+   *          {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
+   *          If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
+   * @throws CorruptIndexException
+   *           if the taxonomy is corrupted.
+   * @throws LockObtainFailedException
+   *           if the taxonomy is locked by another writer. If it is known
+   *           that no other concurrent writer is active, the lock might
+   *           have been left around by an old dead process, and should be
+   *           removed using {@link #unlock(Directory)}.
+   * @throws IOException
+   *           if another error occurred.
+   */
+  public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode,
+      TaxonomyWriterCache cache)
+      throws CorruptIndexException, LockObtainFailedException, IOException {
+
+    if (!IndexReader.indexExists(directory) || openMode==OpenMode.CREATE) {
+      taxoIndexCreateTime = Long.toString(System.nanoTime());
+    }
+
+    indexWriter = openIndexWriter(directory, openMode);
+    reader = null;
+
+    parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
+    parentStreamField.setOmitNorms(true);
+    fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS);
+    fullPathField.setIndexOptions(IndexOptions.DOCS_ONLY);
+
+    this.nextID = indexWriter.maxDoc();
+
+    if (cache==null) {
+      cache = defaultTaxonomyWriterCache();
+    }
+    this.cache = cache;
+
+    if (nextID == 0) {
+      cacheIsComplete = true;
+      // Make sure that the taxonomy always contains the root category
+      // with category id 0.
+      addCategory(new CategoryPath());
+      refreshReader();
+    } else {
+      // There are some categories on the disk, which we have not yet
+      // read into the cache, and therefore the cache is incomplete.
+      // We chose not to read all the categories into the cache now,
+      // to avoid terrible performance when a taxonomy index is opened
+      // to add just a single category. We will do it later, after we
+      // notice a few cache misses.
+      cacheIsComplete = false;
+    }
+    cacheMisses = 0;
+  }
+
+  /**
+   * A hook for extensions of this class to provide their own
+   * {@link IndexWriter} implementation or instance. Extending classes can
+   * instantiate and configure the {@link IndexWriter} as they see fit,
+   * including setting a {@link org.apache.lucene.index.MergeScheduler}, or
+   * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size
+   * etc.
+   * NOTE: the instance this method returns will be closed upon calling
+   * {@link #close()}.
+   *
+   * @param directory
+   *          the {@link Directory} on top of which an {@link IndexWriter}
+   *          should be opened.
+   * @param openMode
+   *          see {@link OpenMode}
+   */
+  protected IndexWriter openIndexWriter(Directory directory, OpenMode openMode)
+      throws IOException {
+    // Make sure we use a MergePolicy which merges segments in-order and thus
+    // keeps the doc IDs ordered as well (this is crucial for the taxonomy
+    // index).
+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
+        new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy(
+        new LogByteSizeMergePolicy());
+    return new IndexWriter(directory, config);
+  }
+
+  // Currently overridden by a unit test that verifies that every index we open
+  // is close()ed.
+  /**
+   * Open an {@link IndexReader} from the {@link #indexWriter} member, by
+   * calling {@link IndexWriter#getReader()}. Extending classes can override
+   * this method to return their own {@link IndexReader}.
+   */
+  protected IndexReader openReader() throws IOException {
+    return IndexReader.open(indexWriter, true);
+  }
+
+  /**
+   * Creates a new instance with a default cache as defined by
+   * {@link #defaultTaxonomyWriterCache()}.
+   */
+  public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode)
+      throws CorruptIndexException, LockObtainFailedException, IOException {
+    this(directory, openMode, defaultTaxonomyWriterCache());
+  }
+
+  /**
+   * Defines the default {@link TaxonomyWriterCache} to use in constructors
+   * which do not specify one.
+   * <p>
+   * The current default is {@link Cl2oTaxonomyWriterCache}, constructed
+   * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
+   * cached in memory while building it.
+   */
+  public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
+    return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
+  }
+
+  // convenience constructors:
+
+  public DirectoryTaxonomyWriter(Directory d)
+      throws CorruptIndexException, LockObtainFailedException, IOException {
+    this(d, OpenMode.CREATE_OR_APPEND);
+  }
+
+  /**
+   * Frees used resources as well as closes the underlying {@link IndexWriter},
+   * which commits whatever changes were made to it to the underlying
+   * {@link Directory}.
+   */
+  public synchronized void close() throws CorruptIndexException, IOException {
+    if (indexWriter != null) {
+      if (taxoIndexCreateTime != null) {
+        indexWriter.commit(combinedCommitData(null));
+        taxoIndexCreateTime = null;
+      }
+      doClose();
+    }
+  }
+
+  private void doClose() throws CorruptIndexException, IOException {
+    indexWriter.close();
+    indexWriter = null;
+    closeResources();
+  }
+
+  /**
+   * Returns the number of memory bytes used by the cache.
+   * @return Number of cache bytes in memory, for CL2O only; zero otherwise.
+   */
+  public int getCacheMemoryUsage() {
+    ensureOpen();
+    if (this.cache == null || !(this.cache instanceof Cl2oTaxonomyWriterCache)) {
+      return 0;
+    }
+    return ((Cl2oTaxonomyWriterCache)this.cache).getMemoryUsage();
+  }
+
+  /**
+   * A hook for extending classes to close additional resources that were used.
+   * The default implementation closes the {@link IndexReader} as well as the
+   * {@link TaxonomyWriterCache} instances that were used.
+   * NOTE: if you override this method, you should include a
+   * super.closeResources() call in your implementation.
+   */
+  protected synchronized void closeResources() throws IOException {
+    if (reader != null) {
+      reader.close();
+      reader = null;
+    }
+    if (cache != null) {
+      cache.close();
+      cache = null;
+    }
+  }
+
+  /**
+   * Look up the given category in the cache and/or the on-disk storage,
+   * returning the category's ordinal, or a negative number in case the
+   * category does not yet exist in the taxonomy.
+   */
+  protected int findCategory(CategoryPath categoryPath) throws IOException {
+    // If we can find the category in our cache, we can return the
+    // response directly from it:
+    int res = cache.get(categoryPath);
+    if (res >= 0) {
+      return res;
+    }
+    // If we know that the cache is complete, i.e., contains every category
+    // which exists, we can return -1 immediately. However, if the cache is
+    // not complete, we need to check the disk.
+    if (cacheIsComplete) {
+      return -1;
+    }
+    cacheMisses++;
+    // After a few cache misses, it makes sense to read all the categories
+    // from disk and into the cache. The reason not to do this on the first
+    // cache miss (or even when opening the writer) is that it will
+    // significantly slow down the case when a taxonomy is opened just to
+    // add one category. The idea of only spending a long time on reading
+    // after enough time was spent on cache misses is known as an "online
+    // algorithm".
+    if (perhapsFillCache()) {
+      return cache.get(categoryPath);
+    }
+
+    // We need to get an answer from the on-disk index. If a reader
+    // is not yet open, do it now:
+    if (reader == null) {
+      reader = openReader();
+    }
+
+    TermDocs docs = reader.termDocs(new Term(Consts.FULL,
+        categoryPath.toString(delimiter)));
+    if (!docs.next()) {
+      return -1; // category does not exist in taxonomy
+    }
+    // Note: we do NOT add to the cache the fact that the category
+    // does not exist. The reason is that our only use for this
+    // method is just before we actually add this category. If
+    // in the future this usage changes, we should consider caching
+    // the fact that the category is not in the taxonomy.
+    addToCache(categoryPath, docs.doc());
+    return docs.doc();
+  }
+
+  /**
+   * Look up the given prefix of the given category in the cache and/or the
+   * on-disk storage, returning that prefix's ordinal, or a negative number in
+   * case the category does not yet exist in the taxonomy.
+   */
+  private int findCategory(CategoryPath categoryPath, int prefixLen)
+      throws IOException {
+    int res = cache.get(categoryPath, prefixLen);
+    if (res >= 0) {
+      return res;
+    }
+    if (cacheIsComplete) {
+      return -1;
+    }
+    cacheMisses++;
+    if (perhapsFillCache()) {
+      return cache.get(categoryPath, prefixLen);
+    }
+    if (reader == null) {
+      reader = openReader();
+    }
+    TermDocs docs = reader.termDocs(new Term(Consts.FULL,
+        categoryPath.toString(delimiter, prefixLen)));
+    if (!docs.next()) {
+      return -1; // category does not exist in taxonomy
+    }
+    addToCache(categoryPath, prefixLen, docs.doc());
+    return docs.doc();
+  }
+
+  // TODO (Facet): addCategory() is synchronized. This means that if indexing is
+  // multi-threaded, a new category that needs to be written to disk (and
+  // potentially even trigger a lengthy merge) locks out other addCategory()
+  // calls - even those which could immediately return a cached value.
+  // We definitely need to fix this situation!
+  public synchronized int addCategory(CategoryPath categoryPath) throws IOException {
+    ensureOpen();
+    // If the category is already in the cache and/or the taxonomy, we
+    // should return its existing ordinal:
+    int res = findCategory(categoryPath);
+    if (res < 0) {
+      // This is a new category, and we need to insert it into the index
+      // (and the cache). Actually, we might also need to add some of
+      // the category's ancestors before we can add the category itself
+      // (while keeping the invariant that a parent is always added to
+      // the taxonomy before its child). internalAddCategory() does all
+      // this recursively:
+      res = internalAddCategory(categoryPath, categoryPath.length());
+    }
+    return res;
+  }
+
+  /**
+   * Add a new category into the index (and the cache), and return its new
+   * ordinal.
+   * <p>
+   * Actually, we might also need to add some of the category's ancestors
+   * before we can add the category itself (while keeping the invariant that a
+   * parent is always added to the taxonomy before its child). We do this by
+   * recursion.
+   */
+  private int internalAddCategory(CategoryPath categoryPath, int length)
+      throws CorruptIndexException, IOException {
+    // Find our parent's ordinal (recursively adding the parent category
+    // to the taxonomy if it's not already there). Then add the parent
+    // ordinal as payloads (rather than a stored field; payloads can be
+    // more efficiently read into memory in bulk by LuceneTaxonomyReader)
+    int parent;
+    if (length > 1) {
+      parent = findCategory(categoryPath, length - 1);
+      if (parent < 0) {
+        parent = internalAddCategory(categoryPath, length - 1);
+      }
+    } else if (length == 1) {
+      parent = TaxonomyReader.ROOT_ORDINAL;
+    } else {
+      parent = TaxonomyReader.INVALID_ORDINAL;
+    }
+    int id = addCategoryDocument(categoryPath, length, parent);
+    return id;
+  }
+
+  /**
+   * Verifies that this instance wasn't closed, or throws
+   * {@link AlreadyClosedException} if it is.
+   */
+  protected final void ensureOpen() {
+    if (indexWriter == null) {
+      throw new AlreadyClosedException("The taxonomy writer has already been closed");
+    }
+  }
+
+  // Note that the methods calling addCategoryDocument() are synchronized,
+  // so this method is effectively synchronized as well, but we'll add
+  // synchronized to be on the safe side, and we can reuse class-local objects
+  // instead of allocating them every time
+  protected synchronized int addCategoryDocument(CategoryPath categoryPath,
+      int length, int parent)
+      throws CorruptIndexException, IOException {
+    // Before Lucene 2.9, position increments >=0 were supported, so we
+    // added 1 to parent to allow the parent -1 (the parent of the root).
+    // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
+    // no longer enough, since 0 is not encoded consistently either (see
+    // comment in SinglePositionTokenStream). But because we must be
+    // backward-compatible with existing indexes, we can't just fix what
+    // we write here (e.g., to write parent+2), and need to do a workaround
+    // in the reader (which knows that anyway only category 0 has a parent
+    // -1).
+    parentStream.set(parent+1);
+    Document d = new Document();
+    d.add(parentStreamField);
+
+    fullPathField.setValue(categoryPath.toString(delimiter, length));
+    d.add(fullPathField);
+
+    // Note that we do not pass an Analyzer here because the fields that are
+    // added to the Document are untokenized or contain their own TokenStream.
+    // Therefore the IndexWriter's Analyzer has no effect.
+    indexWriter.addDocument(d);
+    int id = nextID++;
+
+    addToCache(categoryPath, length, id);
+
+    // also add to the parent array
+    getParentArray().add(id, parent);
+
+    return id;
+  }
+
+  private static class SinglePositionTokenStream extends TokenStream {
+    private CharTermAttribute termAtt;
+    private PositionIncrementAttribute posIncrAtt;
+    private boolean returned;
+
+    public SinglePositionTokenStream(String word) {
+      termAtt = addAttribute(CharTermAttribute.class);
+      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+      termAtt.setEmpty().append(word);
+      returned = true;
+    }
+
+    /**
+     * Set the value we want to keep, as the position increment.
+     * Note that when TermPositions.nextPosition() is later used to
+     * retrieve this value, val-1 will be returned, not val.
+     * <p>
+     * IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
+     * the retrieved position would be -1). But starting with Lucene 2.9,
+     * this unfortunately changed, and only val>0 are safe. val=0 can
+     * still be used, but don't count on the value you retrieve later
+     * (it could be 0 or -1, depending on circumstances or versions).
+     * This change is described in Lucene's JIRA: LUCENE-1542.
+     */
+    public void set(int val) {
+      posIncrAtt.setPositionIncrement(val);
+      returned = false;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (returned) {
+        return false;
+      }
+      returned = true;
+      return true;
+    }
+  }
+
+  private void addToCache(CategoryPath categoryPath, int id)
+      throws CorruptIndexException, IOException {
+    if (cache.put(categoryPath, id)) {
+      // If cache.put() returned true, it means the cache was limited in
+      // size, became full, so parts of it had to be cleared.
+      // Unfortunately we don't know which part was cleared - it is
+      // possible that a relatively-new category that hasn't yet been
+      // committed to disk (and therefore isn't yet visible in our
+      // "reader") was deleted from the cache, and therefore we must
+      // now refresh the reader.
+      // Because this is a slow operation, cache implementations are
+      // expected not to delete entries one-by-one but rather in bulk
+      // (LruTaxonomyWriterCache removes the oldest 2/3 of its entries).
+      refreshReader();
+      cacheIsComplete = false;
+    }
+  }
+
+  private void addToCache(CategoryPath categoryPath, int prefixLen, int id)
+      throws CorruptIndexException, IOException {
+    if (cache.put(categoryPath, prefixLen, id)) {
+      refreshReader();
+      cacheIsComplete = false;
+    }
+  }
+
+  private synchronized void refreshReader() throws IOException {
+    if (reader != null) {
+      IndexReader r2 = IndexReader.openIfChanged(reader);
+      if (r2 != null) {
+        reader.close();
+        reader = r2;
+      }
+    }
+  }
+
+  /**
+   * Calling commit() ensures that all the categories written so far are
+   * visible to a reader that is opened (or reopened) after that call.
+   * When the index is closed(), commit() is also implicitly done.
+   * See {@link TaxonomyWriter#commit()}.
+   */
+  public synchronized void commit() throws CorruptIndexException, IOException {
+    ensureOpen();
+    if (taxoIndexCreateTime != null) {
+      indexWriter.commit(combinedCommitData(null));
+      taxoIndexCreateTime = null;
+    } else {
+      indexWriter.commit();
+    }
+    refreshReader();
+  }
+
+  /**
+   * Combine original user data with that of the taxonomy creation time.
+   */
+  private Map<String, String> combinedCommitData(Map<String, String> userData) {
+    Map<String, String> m = new HashMap<String, String>();
+    if (userData != null) {
+      m.putAll(userData);
+    }
+    m.put(INDEX_CREATE_TIME, taxoIndexCreateTime);
+    return m;
+  }
+
+  /**
+   * Like commit(), but also store properties with the index. These properties
+   * are retrievable by {@link DirectoryTaxonomyReader#getCommitUserData}.
+   * See {@link TaxonomyWriter#commit(Map)}.
+   */
+  public synchronized void commit(Map<String, String> commitUserData) throws CorruptIndexException, IOException {
+    ensureOpen();
+    if (taxoIndexCreateTime != null) {
+      indexWriter.commit(combinedCommitData(commitUserData));
+      taxoIndexCreateTime = null;
+    } else {
+      indexWriter.commit(commitUserData);
+    }
+    refreshReader();
+  }
+
+  /**
+   * Prepare most of the work needed for a two-phase commit.
+   * See {@link IndexWriter#prepareCommit}.
+ */ + public synchronized void prepareCommit() throws CorruptIndexException, IOException { + ensureOpen(); + if (taxoIndexCreateTime != null) { + indexWriter.prepareCommit(combinedCommitData(null)); + taxoIndexCreateTime = null; + } else { + indexWriter.prepareCommit(); + } + } + + /** + * Like above, and also prepares to store user data with the index. + * See {@link IndexWriter#prepareCommit(Map)} + */ + public synchronized void prepareCommit(Map commitUserData) throws CorruptIndexException, IOException { + ensureOpen(); + if (taxoIndexCreateTime != null) { + indexWriter.prepareCommit(combinedCommitData(commitUserData)); + taxoIndexCreateTime = null; + } else { + indexWriter.prepareCommit(commitUserData); + } + } + + /** + * getSize() returns the number of categories in the taxonomy. + *
+   * <p>
+   * Because categories are numbered consecutively starting with 0, the
+   * taxonomy contains ordinals 0 through getSize()-1.
+   * <p>
+   * Note that the number returned by getSize() is often slightly higher than
+   * the number of categories inserted into the taxonomy; this is because when
+   * a category is added to the taxonomy, its ancestors are also added
+   * automatically (including the root, which always gets ordinal 0).
+   */
+  synchronized public int getSize() {
+    ensureOpen();
+    return indexWriter.maxDoc();
+  }
+
+  private boolean alreadyCalledFillCache = false;
+
+  /**
+   * Set the number of cache misses before an attempt is made to read the
+   * entire taxonomy into the in-memory cache.
+   * <p>
+   * LuceneTaxonomyWriter holds an in-memory cache of recently seen
+   * categories to speed up operation. On each cache miss, the on-disk index
+   * needs to be consulted. When an existing taxonomy is opened, many such
+   * slow disk reads are needed until the cache is filled, so it
+   * is more efficient to read the entire taxonomy into memory at once.
+   * We do this complete read after a certain number (defined by this method)
+   * of cache misses.
+   * <p>
+   * If the number is set to 0, the entire taxonomy is read
+   * into the cache on first use, without fetching individual categories
+   * first.
+   * <p>
+   * Note that if the memory cache of choice is limited in size, and cannot
+   * hold the entire content of the on-disk taxonomy, then it is never
+   * read in its entirety into the cache, regardless of the setting of this
+   * method.
+   */
+  public void setCacheMissesUntilFill(int i) {
+    ensureOpen();
+    cacheMissesUntilFill = i;
+  }
+
+  private int cacheMissesUntilFill = 11;
+
+  private boolean perhapsFillCache() throws IOException {
+    // Note: we assume that we're only called when cacheIsComplete==false.
+    // TODO (Facet): parametrize this criterion:
+    if (cacheMisses < cacheMissesUntilFill) {
+      return false;
+    }
+    // If the cache was already filled (or we decided not to fill it because
+    // there was no room), there is no sense in trying it again.
+    if (alreadyCalledFillCache) {
+      return false;
+    }
+    alreadyCalledFillCache = true;
+    // TODO (Facet): we should probably completely clear the cache before starting
+    // to read it?
+    if (reader == null) {
+      reader = openReader();
+    }
+
+    if (!cache.hasRoom(reader.numDocs())) {
+      return false;
+    }
+
+    CategoryPath cp = new CategoryPath();
+    TermDocs td = reader.termDocs();
+    Term fullPathTerm = new Term(Consts.FULL);
+    String field = fullPathTerm.field(); // needed so we can later use !=
+    TermEnum terms = reader.terms(fullPathTerm);
+    // The check is done here to avoid checking it on every iteration of the
+    // below loop. A null term will be returned if there are no terms in the
+    // lexicon, or after the Consts.FULL term. However, while the loop is
+    // executed we're safe, because we only iterate as long as there are next()
+    // terms.
+    if (terms.term() != null) {
+      do {
+        Term t = terms.term();
+        if (t.field() != field) break;
+        // Since we guarantee uniqueness of categories, each term has exactly
+        // one document. Also, since we do not allow removing categories (and
+        // hence documents), there are no deletions in the index. Therefore, it
+        // is sufficient to call next(), and then doc(), exactly once with no
+        // 'validation' checks.
+        td.seek(t);
+        td.next();
+        cp.clear();
+        cp.add(t.text(), delimiter);
+        cache.put(cp, td.doc());
+      } while (terms.next());
+    }
+
+    cacheIsComplete = true;
+    // No sense in keeping the reader open - we will not need to read from it
+    // if everything is in the cache.
+    reader.close();
+    reader = null;
+    return true;
+  }
+
+  private ParentArray parentArray;
+
+  private synchronized ParentArray getParentArray() throws IOException {
+    if (parentArray==null) {
+      if (reader == null) {
+        reader = openReader();
+      }
+      parentArray = new ParentArray();
+      parentArray.refresh(reader);
+    }
+    return parentArray;
+  }
+
+  public int getParent(int ordinal) throws IOException {
+    ensureOpen();
+    // Note: the following if() just enforces that a user can never ask
+    // for the parent of a nonexistent category - even if the parent array
+    // was allocated bigger than it really needs to be.
+    if (ordinal >= getSize()) {
+      throw new ArrayIndexOutOfBoundsException();
+    }
+    return getParentArray().getArray()[ordinal];
+  }
+
+  /**
+   * Take all the categories of one or more given taxonomies, and add them to
+   * the main taxonomy (this), if they are not already there.
+   * <p>
+   * Additionally, fill a mapping for each of the added taxonomies,
+   * mapping its ordinals to the ordinals in the enlarged main taxonomy.
+   * These mappings are saved into an array of OrdinalMap objects given by the
+   * user, one for each of the given taxonomies (not including "this", the main
+   * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
+   * others will be a DiskOrdinalMap - see discussion in {@link OrdinalMap}.
+   * <p>
+   * Note that the taxonomies to be added are given as Directory objects,
+   * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
+   * currently managed by an open TaxonomyWriter, make sure to commit() (or
+   * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
+   * and does not need to be commit()ed before this call.
+   */
+  public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
+    ensureOpen();
+    // To prevent us stepping on the rest of this class's decisions on when
+    // to open a reader, and when not, we'll be opening a new reader instead
+    // of using the existing "reader" object:
+    IndexReader mainreader = openReader();
+    TermEnum mainte = mainreader.terms(new Term(Consts.FULL));
+
+    IndexReader[] otherreaders = new IndexReader[taxonomies.length];
+    TermEnum[] othertes = new TermEnum[taxonomies.length];
+    for (int i=0; i<taxonomies.length; i++) {
+      otherreaders[i] = IndexReader.open(taxonomies[i]);
+      othertes[i] = otherreaders[i].terms(new Term(Consts.FULL));
+      // Also tell the ordinal maps their expected sizes:
+      ordinalMaps[i].setSize(otherreaders[i].numDocs());
+    }
+
+    CategoryPath cp = new CategoryPath();
+
+    // We keep a "current" cursor over the alphabetically-ordered list of
+    // categories in each taxonomy. We start the cursor on the first
+    // (alphabetically) category of each taxonomy:
+
+    String currentMain;
+    String[] currentOthers = new String[taxonomies.length];
+    currentMain = nextTE(mainte);
+    int otherTaxonomiesLeft = 0;
+    for (int i=0; i<taxonomies.length; i++) {
+      currentOthers[i] = nextTE(othertes[i]);
+      if (currentOthers[i]!=null) {
+        otherTaxonomiesLeft++;
+      }
+    }
+
+    // And then, at each step, look at the first (alphabetically) of the
+    // current taxonomies.
+    // NOTE: The most efficient way we could have done this is using a
+    // PriorityQueue. But for simplicity, and assuming that usually we'll
+    // only have a small number of other taxonomies (often just 1), we use
+    // a more naive algorithm (O(ntaxonomies) instead of O(log ntaxonomies)
+    // per step).
+
+    while (otherTaxonomiesLeft>0) {
+      String first=null;
+      for (int i=0; i<taxonomies.length; i++) {
+        if (currentOthers[i]==null) continue;
+        if (first==null || first.compareTo(currentOthers[i])>0) {
+          first = currentOthers[i];
+        }
+      }
+      int comp = 0;
+      if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
+        // If 'first' is before currentMain, or currentMain is null,
+        // then 'first' is a new category and we need to add it to the
+        // main taxonomy. Then for all taxonomies with this 'first'
+        // category, we need to add the new category number to their
+        // map, and move to the next category in all of them.
+        cp.clear();
+        cp.add(first, delimiter);
+        // We can call internalAddCategory() instead of addCategory()
+        // because we know the category hasn't been seen yet.
+        int newordinal = internalAddCategory(cp, cp.length());
+        // TODO (Facet): we already had this term in our hands before, in nextTE...
+        Term t = new Term(Consts.FULL, first);
+        for (int i=0; i<taxonomies.length; i++) {
+          if (first.equals(currentOthers[i])) {
+            // Remember the remapping of this ordinal. Note how this
+            // requires reading a posting list from the index - but since
+            // we do this in lexical order of terms, just like Lucene's
+            // merge works, we hope there are few seeks.
+            // TODO (Facet): is there a quicker way? E.g., not specifying the
+            // next term by name every time?
+            TermDocs td = otherreaders[i].termDocs(t);
+            td.next(); // TODO (Facet): check?
+            int origordinal = td.doc();
+            ordinalMaps[i].addMapping(origordinal, newordinal);
+            // ... and move to the next category in the i'th taxonomy:
+            currentOthers[i] = nextTE(othertes[i]);
+            if (currentOthers[i]==null) {
+              otherTaxonomiesLeft--;
+            }
+          }
+        }
+      } else if (comp==0) {
+        // 'first' and currentMain are the same, so both the main and some
+        // other taxonomies need to be moved, but a category doesn't need
+        // to be added because it already existed in the main taxonomy.
+        // TODO (Facet): Again, is there a quicker way?
+        Term t = new Term(Consts.FULL, first);
+        TermDocs td = mainreader.termDocs(t);
+        td.next(); // TODO (Facet): check?
+        int newordinal = td.doc();
+
+        currentMain = nextTE(mainte);
+        for (int i=0; i<taxonomies.length; i++) {
+          if (first.equals(currentOthers[i])) {
+            // TODO (Facet): again, is there a quicker way?
+            td = otherreaders[i].termDocs(t);
+            td.next(); // TODO (Facet): check?
+            int origordinal = td.doc();
+            ordinalMaps[i].addMapping(origordinal, newordinal);
+            // ... and move to the next category:
+            currentOthers[i] = nextTE(othertes[i]);
+            if (currentOthers[i]==null) {
+              otherTaxonomiesLeft--;
+            }
+          }
+        }
+      } else /* comp > 0 */ {
+        // The currentMain doesn't appear in any of the other taxonomies -
+        // we don't need to do anything, just continue to the next one.
+        currentMain = nextTE(mainte);
+      }
+    }
+
+    // Close all the readers we've opened, and also tell the ordinal maps
+    // we're done adding to them.
+    mainreader.close();
+    for (int i=0; i<taxonomies.length; i++) {
+      otherreaders[i].close();
+      // We never actually added a mapping for the root ordinal - let's do
+      // it now, just so that the map is complete (every ordinal between 0
+      // and size-1 is remapped):
+      ordinalMaps[i].addMapping(0, 0);
+      ordinalMaps[i].addDone();
+    }
+  }
+
+  private static String nextTE(TermEnum te) throws IOException {
+    if (te.next()) {
+      Term t = te.term();
+      if (t.field() == Consts.FULL) { // TODO (Facet): avoid == with interned strings?
+        return t.text();
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Mapping from old ordinals to new ordinals, used when merging indexes
+   * with separate taxonomies.
+   * <p>
+   * addTaxonomies() merges one or more taxonomies into the given taxonomy
+   * (this). An OrdinalMap is filled for each of the added taxonomies,
+   * containing the new ordinal (in the merged taxonomy) of each of the
+   * categories in the old taxonomy.
+   * <p>
+   * There exist two implementations of OrdinalMap: MemoryOrdinalMap and
+   * DiskOrdinalMap. As their names suggest, the former keeps the map in
+   * memory and the latter in a temporary disk file. Because these maps will
+   * later be needed one by one (to remap the counting lists), not all at the
+   * same time, it is recommended to put the first taxonomy's map in memory,
+   * and all the rest on disk (later to be automatically read into memory one
+   * by one, when needed).
+   */
+  public static interface OrdinalMap {
+    /**
+     * Set the size of the map. This MUST be called before addMapping().
+     * It is assumed (but not verified) that addMapping() will then be
+     * called exactly 'size' times, with different origOrdinals between 0
+     * and size-1.
+     */
+    public void setSize(int size) throws IOException;
+
+    public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
+
+    /**
+     * Call addDone() to say that all addMapping() calls have been done.
+     * In some implementations this might free some resources.
+     */
+    public void addDone() throws IOException;
+
+    /**
+     * Return the map from the taxonomy's original (consecutive) ordinals
+     * to the new taxonomy's ordinals. If the map has to be read from disk
+     * and ordered appropriately, it is done when getMap() is called.
+     * getMap() should only be called once, and only when the map is actually
+     * needed. Calling it will also free all resources that the map might
+     * be holding (such as temporary disk space), other than the returned int[].
+     */
+    public int[] getMap() throws IOException;
+  }
+
+  /**
+   * {@link OrdinalMap} maintained in memory.
+   */
+  public static final class MemoryOrdinalMap implements OrdinalMap {
+    int[] map;
+
+    public void setSize(int taxonomySize) {
+      map = new int[taxonomySize];
+    }
+
+    public void addMapping(int origOrdinal, int newOrdinal) {
+      map[origOrdinal] = newOrdinal;
+    }
+
+    public void addDone() { /* nothing to do */ }
+
+    public int[] getMap() {
+      return map;
+    }
+  }
+
+  /**
+   * {@link OrdinalMap} maintained on the file system.
+   */
+  public static final class DiskOrdinalMap implements OrdinalMap {
+    File tmpfile;
+    DataOutputStream out;
+
+    public DiskOrdinalMap(File tmpfile) throws FileNotFoundException {
+      this.tmpfile = tmpfile;
+      out = new DataOutputStream(new BufferedOutputStream(
+          new FileOutputStream(tmpfile)));
+    }
+
+    public void addMapping(int origOrdinal, int newOrdinal) throws IOException {
+      out.writeInt(origOrdinal);
+      out.writeInt(newOrdinal);
+    }
+
+    public void setSize(int taxonomySize) throws IOException {
+      out.writeInt(taxonomySize);
+    }
+
+    public void addDone() throws IOException {
+      if (out!=null) {
+        out.close();
+        out = null;
+      }
+    }
+
+    int[] map = null;
+
+    public int[] getMap() throws IOException {
+      if (map!=null) {
+        return map;
+      }
+      addDone(); // in case this wasn't previously called
+      DataInputStream in = new DataInputStream(new BufferedInputStream(
+          new FileInputStream(tmpfile)));
+      map = new int[in.readInt()];
+      // NOTE: The current code assumes here that the map is complete,
+      // i.e., every ordinal gets one and exactly one value. Otherwise,
+      // we may run into an EOF here, or vice versa, not read everything.
+      for (int i=0; i<map.length; i++) {
+        int origordinal = in.readInt();
+        int newordinal = in.readInt();
+        map[origordinal] = newordinal;
+      }
+      in.close();
+
+      // Delete the temporary file, which is no longer needed.
+      if (!tmpfile.delete()) {
+        tmpfile.deleteOnExit();
+      }
+
+      return map;
+    }
+  }
+
+}
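
For orientation, a minimal usage sketch of the class this diff adds. It is a hedged illustration rather than part of the patch: the "/tmp/taxo" location is a made-up placeholder, and error handling is omitted for brevity.

import java.io.File;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TaxonomyWriterExample {
  public static void main(String[] args) throws Exception {
    // "/tmp/taxo" is a hypothetical location; any Directory implementation works.
    Directory dir = FSDirectory.open(new File("/tmp/taxo"));
    DirectoryTaxonomyWriter writer =
        new DirectoryTaxonomyWriter(dir, OpenMode.CREATE_OR_APPEND);
    // addCategory() returns the category's ordinal, adding the category
    // (and any missing ancestors, such as "Author" here) only if needed.
    int ordinal = writer.addCategory(new CategoryPath("Author", "Mark Twain"));
    int again = writer.addCategory(new CategoryPath("Author", "Mark Twain"));
    assert ordinal == again; // the same path always maps to the same ordinal
    writer.commit(); // make the categories visible to readers opened after this
    writer.close();
    dir.close();
  }
}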
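
The caching policy discussed in the javadoc above is pluggable through the three-argument constructor. A sketch, assuming a bounded LRU cache is acceptable for the application (the 4096-entry size is an arbitrary choice, not a recommendation):

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;

public class CacheChoiceExample {
  static DirectoryTaxonomyWriter openWithLruCache(Directory dir) throws IOException {
    // A bounded LRU cache: cheaper than the default Cl2oTaxonomyWriterCache,
    // which keeps the entire taxonomy in memory, but the writer may have to
    // consult the on-disk index on cache misses.
    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(
        dir, OpenMode.CREATE_OR_APPEND, new LruTaxonomyWriterCache(4096));
    // Fill the cache from disk on the first miss instead of after the
    // default 11 misses:
    writer.setCacheMissesUntilFill(0);
    return writer;
  }
}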
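
The openIndexWriter() hook can be overridden as sketched below. The 64 MB RAM buffer is an arbitrary figure chosen for illustration; the merge policy deliberately stays LogByteSizeMergePolicy, since the taxonomy index relies on in-order segment merges.

import java.io.IOException;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public class CustomTaxonomyWriter extends DirectoryTaxonomyWriter {
  public CustomTaxonomyWriter(Directory d) throws IOException {
    super(d);
  }

  @Override
  protected IndexWriter openIndexWriter(Directory directory, OpenMode openMode)
      throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
        new KeywordAnalyzer())
        .setOpenMode(openMode)
        .setMergePolicy(new LogByteSizeMergePolicy()) // keep doc IDs in order
        .setRAMBufferSizeMB(64.0); // arbitrary; the Lucene default is 16 MB
    return new IndexWriter(directory, config);
  }
}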
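
commit(Map) stores application properties alongside the taxonomy, as the javadoc describes. In this sketch the key and value are hypothetical; any property name works except the reserved "index.create.time" (INDEX_CREATE_TIME).

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;

public class CommitUserDataExample {
  static void commitWithUserData(DirectoryTaxonomyWriter writer) throws IOException {
    Map<String, String> userData = new HashMap<String, String>();
    // "my.app.marker" is a hypothetical key; avoid the reserved
    // INDEX_CREATE_TIME property ("index.create.time").
    userData.put("my.app.marker", "42");
    writer.commit(userData);
    // The properties are later retrievable through
    // DirectoryTaxonomyReader#getCommitUserData().
  }
}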
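
Finally, a sketch of the merge flow that addTaxonomies() and OrdinalMap support. It assumes main, side1 and side2 are directories holding already-committed taxonomies, and that "/tmp/side2.map" is a usable temporary file; both are hypothetical names.

import java.io.File;
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.DiskOrdinalMap;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;

public class TaxonomyMergeExample {
  // Returns one old-ordinal -> new-ordinal map per merged side taxonomy.
  static int[][] merge(Directory main, Directory side1, Directory side2)
      throws IOException {
    DirectoryTaxonomyWriter writer =
        new DirectoryTaxonomyWriter(main, OpenMode.CREATE_OR_APPEND);
    // As the javadoc recommends: first map in memory, the rest on disk.
    OrdinalMap[] maps = new OrdinalMap[] {
        new MemoryOrdinalMap(),
        new DiskOrdinalMap(new File("/tmp/side2.map")) // hypothetical temp file
    };
    writer.addTaxonomies(new Directory[] { side1, side2 }, maps);
    writer.close();
    // getMap() should be called only once per map, when actually needed;
    // it also frees the map's resources (e.g., the temporary file).
    return new int[][] { maps[0].getMap(), maps[1].getMap() };
  }
}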