1 package org.apache.lucene.facet.index;
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.HashMap;
8 import java.util.Map.Entry;
10 import org.apache.lucene.analysis.TokenStream;
11 import org.apache.lucene.document.Document;
12 import org.apache.lucene.document.Field;
14 import org.apache.lucene.DocumentBuilder;
15 import org.apache.lucene.facet.index.attributes.CategoryAttribute;
16 import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable;
17 import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
18 import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
19 import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
20 import org.apache.lucene.facet.index.params.FacetIndexingParams;
21 import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
22 import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
23 import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
24 import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
25 import org.apache.lucene.facet.index.streaming.CountingListTokenizer;
26 import org.apache.lucene.facet.taxonomy.CategoryPath;
27 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
30 * Licensed to the Apache Software Foundation (ASF) under one or more
31 * contributor license agreements. See the NOTICE file distributed with
32 * this work for additional information regarding copyright ownership.
33 * The ASF licenses this file to You under the Apache License, Version 2.0
34 * (the "License"); you may not use this file except in compliance with
35 * the License. You may obtain a copy of the License at
37 * http://www.apache.org/licenses/LICENSE-2.0
39 * Unless required by applicable law or agreed to in writing, software
40 * distributed under the License is distributed on an "AS IS" BASIS,
41 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
42 * See the License for the specific language governing permissions and
43 * limitations under the License.
/**
 * A utility class which allows attachment of {@link CategoryPath}s or
 * {@link CategoryAttribute}s to a given document using a taxonomy.<br>
 * Construction can be done with either a given {@link FacetIndexingParams} or
 * the default implementation {@link DefaultFacetIndexingParams}.<br>
 * A CategoryDocumentBuilder can be reused by repeatedly setting the categories
 * and building the document. Categories are provided either as
 * {@link CategoryAttribute} elements through {@link #setCategories(Iterable)},
 * or as {@link CategoryPath} elements through
 * {@link #setCategoryPaths(Iterable)}.
 * <p>
 * Note that both {@link #setCategories(Iterable)} and
 * {@link #setCategoryPaths(Iterable)} return this
 * {@link CategoryDocumentBuilder}, allowing the following pattern: {@code new
 * CategoryDocumentBuilder(taxonomy,
 * params).setCategories(categories).build(doc)}.
 *
 * @lucene.experimental
 */
65 public class CategoryDocumentBuilder implements DocumentBuilder {
68 * A {@link TaxonomyWriter} for adding categories and retrieving their
71 protected final TaxonomyWriter taxonomyWriter;
74 * Parameters to be used when indexing categories.
76 protected final FacetIndexingParams indexingParams;
79 * A list of fields which is filled at ancestors' construction and used
80 * during {@link CategoryDocumentBuilder#build(Document)}.
82 protected final ArrayList<Field> fieldList = new ArrayList<Field>();
84 protected Map<String, List<CategoryAttribute>> categoriesMap;
/**
 * Creates a facets document builder with default facet indexing
 * parameters. See
 * {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)}.
 *
 * @param taxonomyWriter
 *            to which new categories will be added, as well as translating
 *            known categories to ordinals
 * @throws IOException
 *             declared because the constructor this one delegates to
 *             declares it
 */
public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter)
    throws IOException {
  // this(...) must be the first statement and cannot be wrapped in a try
  // block, so the checked exception has to be propagated in the signature.
  this(taxonomyWriter, new DefaultFacetIndexingParams());
}
/**
 * Creates a facets document builder with the given facet indexing
 * parameters.
 *
 * @param taxonomyWriter
 *            to which new categories will be added, as well as translating
 *            known categories to ordinals
 * @param params
 *            holds all parameters the indexing process should use such as
 *            category-list parameters
 * @throws IOException
 *             declared by the signature; the statements in this constructor
 *             are plain field assignments
 */
public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter,
    FacetIndexingParams params) throws IOException {
  this.taxonomyWriter = taxonomyWriter;
  this.indexingParams = params;
  // Starts empty; refilled by fillCategoriesMap on every set call.
  this.categoriesMap = new HashMap<String, List<CategoryAttribute>>();
}
123 * Set the categories of the document builder from an {@link Iterable} of
124 * {@link CategoryPath} objects.
126 * @param categoryPaths
127 * An iterable of CategoryPath objects which holds the categories
128 * (facets) which will be added to the document at
129 * {@link #build(Document)}
130 * @return This CategoryDocumentBuilder, to enable this one line call:
131 * {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
132 * {@link #setCategoryPaths(Iterable)}.{@link #build(Document)}.
133 * @throws IOException
135 public CategoryDocumentBuilder setCategoryPaths(
136 Iterable<CategoryPath> categoryPaths) throws IOException {
137 if (categoryPaths == null) {
141 return setCategories(new CategoryAttributesIterable(categoryPaths));
145 * Set the categories of the document builder from an {@link Iterable} of
146 * {@link CategoryAttribute} objects.
149 * An iterable of {@link CategoryAttribute} objects which holds
150 * the categories (facets) which will be added to the document at
151 * {@link #build(Document)}
152 * @return This CategoryDocumentBuilder, to enable this one line call:
153 * {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
154 * {@link #setCategories(Iterable)}.{@link #build(Document)}.
155 * @throws IOException
157 public CategoryDocumentBuilder setCategories(
158 Iterable<CategoryAttribute> categories) throws IOException {
160 if (categories == null) {
164 // get field-name to a list of facets mapping as different facets could
165 // be added to different category-lists on different fields
166 fillCategoriesMap(categories);
168 // creates a different stream for each different field
169 for (Entry<String, List<CategoryAttribute>> e : categoriesMap
171 // create a category attributes stream for the array of facets
172 CategoryAttributesStream categoryAttributesStream = new CategoryAttributesStream(
175 // Set a suitable {@link TokenStream} using
176 // CategoryParentsStream, followed by CategoryListTokenizer and
177 // CategoryTokenizer composition (the ordering of the last two is
179 CategoryParentsStream parentsStream = (CategoryParentsStream) getParentsStream(categoryAttributesStream);
180 CategoryListTokenizer categoryListTokenizer = getCategoryListTokenizer(parentsStream);
181 CategoryTokenizer stream = getCategoryTokenizer(categoryListTokenizer);
183 // Finally creating a suitable field with stream and adding it to a
184 // master field-list, used during the build process (see
186 fieldList.add(new Field(e.getKey(), stream));
193 * Get a stream of categories which includes the parents, according to
194 * policies defined in indexing parameters.
196 * @param categoryAttributesStream
198 * @return The parents stream.
199 * @see OrdinalPolicy OrdinalPolicy (for policy of adding category tokens for parents)
200 * @see PathPolicy PathPolicy (for policy of adding category <b>list</b> tokens for parents)
202 protected TokenStream getParentsStream(
203 CategoryAttributesStream categoryAttributesStream) {
204 return new CategoryParentsStream(categoryAttributesStream,
205 taxonomyWriter, indexingParams);
209 * Fills the categories mapping between a field name and a list of
210 * categories that belongs to it according to this builder's
211 * {@link FacetIndexingParams} object
214 * Iterable over the category attributes
216 protected void fillCategoriesMap(Iterable<CategoryAttribute> categories)
218 categoriesMap.clear();
221 for (CategoryAttribute category : categories) {
222 // extracting the field-name to which this category belongs
223 String fieldName = indexingParams.getCategoryListParams(
224 category.getCategoryPath()).getTerm().field();
226 // getting the list of categories which belongs to that field
227 List<CategoryAttribute> list = categoriesMap.get(fieldName);
229 // if no such list exists
231 // adding a new one to the map
232 list = new ArrayList<CategoryAttribute>();
233 categoriesMap.put(fieldName, list);
236 // adding the new category to the list
237 list.add(category.clone());
242 * Get a category list tokenizer (or a series of such tokenizers) to create
243 * the <b>category list tokens</b>.
245 * @param categoryStream
246 * A stream containing {@link CategoryAttribute} with the
248 * @return The category list tokenizer (or series of tokenizers) to be used
249 * in creating category list tokens.
251 protected CategoryListTokenizer getCategoryListTokenizer(
252 TokenStream categoryStream) {
253 return getCountingListTokenizer(categoryStream);
257 * Get a {@link CountingListTokenizer} for creating counting list token.
259 * @param categoryStream
260 * A stream containing {@link CategoryAttribute}s with the
262 * @return A counting list tokenizer to be used in creating counting list
265 protected CountingListTokenizer getCountingListTokenizer(
266 TokenStream categoryStream) {
267 return new CountingListTokenizer(categoryStream, indexingParams);
271 * Get a {@link CategoryTokenizer} to create the <b>category tokens</b>.
272 * This method can be overridden for adding more attributes to the category
275 * @param categoryStream
276 * A stream containing {@link CategoryAttribute} with the
278 * @return The {@link CategoryTokenizer} to be used in creating category
280 * @throws IOException
282 protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream)
284 return new CategoryTokenizer(categoryStream, indexingParams);
288 * Adds the fields created in one of the "set" methods to the document
290 public Document build(Document doc) {
291 for (Field f : fieldList) {
292 f.setOmitNorms(true);