package org.apache.lucene.facet.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
import org.apache.lucene.facet.index.streaming.CountingListTokenizer;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A utility class which allows attachment of {@link CategoryPath}s or
 * {@link CategoryAttribute}s to a given document using a taxonomy.
 * <p>
 * Construction could be done with either a given {@link FacetIndexingParams} or
 * the default implementation {@link DefaultFacetIndexingParams}.
 * <p>
 * A CategoryDocumentBuilder can be reused by repeatedly setting the categories
 * and building the document. Categories are provided either as
 * {@link CategoryAttribute} elements through {@link #setCategories(Iterable)},
 * or as {@link CategoryPath} elements through
 * {@link #setCategoryPaths(Iterable)}.
 * <p>
 * Note that both {@link #setCategories(Iterable)} and
 * {@link #setCategoryPaths(Iterable)} return this
 * {@link CategoryDocumentBuilder}, allowing the following pattern: {@code new
 * CategoryDocumentBuilder(taxonomy,
 * params).setCategories(categories).build(doc)}.
 *
 * @lucene.experimental
 */
public class CategoryDocumentBuilder {

  /**
   * A {@link TaxonomyWriter} for adding categories and retrieving their
   * ordinals.
   */
  protected final TaxonomyWriter taxonomyWriter;

  /**
   * Parameters to be used when indexing categories.
   */
  protected final FacetIndexingParams indexingParams;

  /**
   * The fields produced by the most recent call to
   * {@link #setCategories(Iterable)} (one per category-list field name), which
   * are added to the document during
   * {@link CategoryDocumentBuilder#build(Document)}.
   */
  protected final ArrayList<Field> fieldList = new ArrayList<Field>();

  /**
   * Mapping from a field name to the categories that are indexed under that
   * field. Rebuilt by {@link #fillCategoriesMap(Iterable)} on every call to
   * {@link #setCategories(Iterable)}.
   */
  protected Map<String, List<CategoryAttribute>> categoriesMap;

  /**
   * Creating a facets document builder with default facet indexing
   * parameters.
   * <p>
   * See:
   * {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)}
   *
   * @param taxonomyWriter
   *            to which new categories will be added, as well as translating
   *            known categories to ordinals
   * @throws IOException
   *             declared by the signature; see
   *             {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)}
   */
  public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter)
      throws IOException {
    this(taxonomyWriter, new DefaultFacetIndexingParams());
  }

  /**
   * Creating a facets document builder with a given facet indexing parameters
   * object.
   *
   * @param taxonomyWriter
   *            to which new categories will be added, as well as translating
   *            known categories to ordinals
   * @param params
   *            holds all parameters the indexing process should use such as
   *            category-list parameters
   * @throws IOException
   *             declared by the signature; the constructor body itself only
   *             stores its arguments and allocates an empty categories map
   */
  public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter,
      FacetIndexingParams params) throws IOException {
    this.taxonomyWriter = taxonomyWriter;
    this.indexingParams = params;
    this.categoriesMap = new HashMap<String, List<CategoryAttribute>>();
  }

  /**
   * Set the categories of the document builder from an {@link Iterable} of
   * {@link CategoryPath} objects.
   *
   * @param categoryPaths
   *            An iterable of CategoryPath objects which holds the categories
   *            (facets) which will be added to the document at
   *            {@link #build(Document)}. If {@code null}, any previously
   *            prepared fields are discarded and nothing else is done.
   * @return This CategoryDocumentBuilder, to enable this one line call:
   *         {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
   *         {@link #setCategoryPaths(Iterable)}.{@link #build(Document)}.
   * @throws IOException
   *             propagated from {@link #setCategories(Iterable)}
   */
  public CategoryDocumentBuilder setCategoryPaths(
      Iterable<CategoryPath> categoryPaths) throws IOException {
    if (categoryPaths == null) {
      fieldList.clear();
      return this;
    }
    return setCategories(new CategoryAttributesIterable(categoryPaths));
  }

  /**
   * Set the categories of the document builder from an {@link Iterable} of
   * {@link CategoryAttribute} objects.
   * <p>
   * Any fields prepared by a previous call are discarded first, so the builder
   * can be reused across documents.
   *
   * @param categories
   *            An iterable of {@link CategoryAttribute} objects which holds
   *            the categories (facets) which will be added to the document at
   *            {@link #build(Document)}. If {@code null}, the builder is left
   *            with no fields.
   * @return This CategoryDocumentBuilder, to enable this one line call:
   *         {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
   *         {@link #setCategories(Iterable)}.{@link #build(Document)}.
   * @throws IOException
   *             propagated from {@link #fillCategoriesMap(Iterable)} or
   *             {@link #getCategoryTokenizer(TokenStream)}
   */
  public CategoryDocumentBuilder setCategories(
      Iterable<CategoryAttribute> categories) throws IOException {
    fieldList.clear();
    if (categories == null) {
      return this;
    }

    // get field-name to a list of facets mapping as different facets could
    // be added to different category-lists on different fields
    fillCategoriesMap(categories);

    // creates a different stream for each different field
    for (Entry<String, List<CategoryAttribute>> e : categoriesMap.entrySet()) {
      // create a category attributes stream for the array of facets
      CategoryAttributesStream categoryAttributesStream =
          new CategoryAttributesStream(e.getValue());

      // Set a suitable {@link TokenStream} using
      // CategoryParentsStream, followed by CategoryListTokenizer and
      // CategoryTokenizer composition (the ordering of the last two is
      // not mandatory).
      //
      // The parents stream is deliberately held as a plain TokenStream:
      // getParentsStream() is an overridable hook declared to return
      // TokenStream and the next stage only requires a TokenStream, so no
      // downcast (and no ClassCastException for custom overrides) is needed.
      TokenStream parentsStream = getParentsStream(categoryAttributesStream);
      CategoryListTokenizer categoryListTokenizer = getCategoryListTokenizer(parentsStream);
      CategoryTokenizer stream = getCategoryTokenizer(categoryListTokenizer);

      // Finally creating a suitable field with stream and adding it to a
      // master field-list, used during the build process (see
      // build(Document))
      fieldList.add(new Field(e.getKey(), stream));
    }

    return this;
  }

  /**
   * Get a stream of categories which includes the parents, according to
   * policies defined in indexing parameters.
   *
   * @param categoryAttributesStream
   *            The input stream
   * @return The parents stream.
   * @see OrdinalPolicy OrdinalPolicy (for policy of adding category tokens for parents)
   * @see PathPolicy PathPolicy (for policy of adding category list tokens for parents)
   */
  protected TokenStream getParentsStream(
      CategoryAttributesStream categoryAttributesStream) {
    return new CategoryParentsStream(categoryAttributesStream,
        taxonomyWriter, indexingParams);
  }

  /**
   * Fills the categories mapping between a field name and a list of
   * categories that belongs to it according to this builder's
   * {@link FacetIndexingParams} object.
   * <p>
   * The map is cleared first; each category is stored as a clone of the
   * element handed out by the iterable.
   *
   * @param categories
   *            Iterable over the category attributes
   * @throws IOException
   *             declared by the signature so that overriding implementations
   *             may throw it
   */
  protected void fillCategoriesMap(Iterable<CategoryAttribute> categories)
      throws IOException {
    categoriesMap.clear();

    // for-each category
    for (CategoryAttribute category : categories) {
      // extracting the field-name to which this category belongs
      String fieldName = indexingParams.getCategoryListParams(
          category.getCategoryPath()).getTerm().field();

      // getting the list of categories which belongs to that field
      List<CategoryAttribute> list = categoriesMap.get(fieldName);

      // if no such list exists
      if (list == null) {
        // adding a new one to the map
        list = new ArrayList<CategoryAttribute>();
        categoriesMap.put(fieldName, list);
      }

      // adding the new category to the list
      list.add(category.clone());
    }
  }

  /**
   * Get a category list tokenizer (or a series of such tokenizers) to create
   * the category list tokens.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute} with the
   *            relevant data.
   * @return The category list tokenizer (or series of tokenizers) to be used
   *         in creating category list tokens.
   */
  protected CategoryListTokenizer getCategoryListTokenizer(
      TokenStream categoryStream) {
    return getCountingListTokenizer(categoryStream);
  }

  /**
   * Get a {@link CountingListTokenizer} for creating counting list token.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute}s with the
   *            relevant data.
   * @return A counting list tokenizer to be used in creating counting list
   *         token.
   */
  protected CountingListTokenizer getCountingListTokenizer(
      TokenStream categoryStream) {
    return new CountingListTokenizer(categoryStream, indexingParams);
  }

  /**
   * Get a {@link CategoryTokenizer} to create the category tokens.
   * This method can be overridden for adding more attributes to the category
   * tokens.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute} with the
   *            relevant data.
   * @return The {@link CategoryTokenizer} to be used in creating category
   *         tokens.
   * @throws IOException
   *             declared by the signature so that overriding implementations
   *             may throw it
   */
  protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream)
      throws IOException {
    return new CategoryTokenizer(categoryStream, indexingParams);
  }

  /**
   * Adds the fields created in one of the "set" methods to the document.
   * Norms are omitted on every field before it is added.
   *
   * @param doc
   *            the document to which the prepared fields are added
   * @return the same {@code doc} instance that was passed in
   */
  public Document build(Document doc) {
    for (Field f : fieldList) {
      f.setOmitNorms(true);
      doc.add(f);
    }
    return doc;
  }

}