1 package org.apache.lucene.collation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
51 public abstract class CollationTestBase extends LuceneTestCase {
  // Endpoints for the two Farsi term-range tests below.
  // First range [U+062F, U+0698]: under Unicode code-point order this range
  // contains U+0633, but under Farsi collation it does not.
  protected String firstRangeBeginningOriginal = "\u062F";
  protected String firstRangeEndOriginal = "\u0698";

  // Second range [U+0633, U+0638]: contains U+0633 under both orderings.
  protected String secondRangeBeginningOriginal = "\u0633";
  protected String secondRangeEndOriginal = "\u0638";
60 * Convenience method to perform the same function as CollationKeyFilter.
62 * @param keyBits the result from
63 * collator.getCollationKey(original).toByteArray()
64 * @return The encoded collation key for the original String
66 protected String encodeCollationKey(byte[] keyBits) {
67 // Ensure that the backing char[] array is large enough to hold the encoded
69 int encodedLength = IndexableBinaryStringTools.getEncodedLength(keyBits, 0, keyBits.length);
70 char[] encodedBegArray = new char[encodedLength];
71 IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
72 return new String(encodedBegArray);
75 public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
76 String firstEnd, String secondBeg,
77 String secondEnd) throws Exception {
78 RAMDirectory ramDir = new RAMDirectory();
79 IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
80 TEST_VERSION_CURRENT, analyzer));
81 Document doc = new Document();
82 doc.add(new Field("content", "\u0633\u0627\u0628",
83 Field.Store.YES, Field.Index.ANALYZED));
84 doc.add(new Field("body", "body",
85 Field.Store.YES, Field.Index.NOT_ANALYZED));
86 writer.addDocument(doc);
88 IndexSearcher searcher = new IndexSearcher(ramDir, true);
89 Query query = new TermQuery(new Term("body","body"));
91 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
92 // orders the U+0698 character before the U+0633 character, so the single
93 // index Term below should NOT be returned by a TermRangeFilter with a Farsi
94 // Collator (or an Arabic one for the case when Farsi searcher not
96 ScoreDoc[] result = searcher.search
97 (query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).scoreDocs;
98 assertEquals("The index Term should not be included.", 0, result.length);
100 result = searcher.search
101 (query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).scoreDocs;
102 assertEquals("The index Term should be included.", 1, result.length);
107 public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
108 String firstEnd, String secondBeg,
109 String secondEnd) throws Exception {
110 RAMDirectory ramDir = new RAMDirectory();
111 IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
112 TEST_VERSION_CURRENT, analyzer));
113 Document doc = new Document();
115 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
116 // orders the U+0698 character before the U+0633 character, so the single
117 // index Term below should NOT be returned by a TermRangeQuery with a Farsi
118 // Collator (or an Arabic one for the case when Farsi is not supported).
119 doc.add(new Field("content", "\u0633\u0627\u0628",
120 Field.Store.YES, Field.Index.ANALYZED));
121 writer.addDocument(doc);
123 IndexSearcher searcher = new IndexSearcher(ramDir, true);
125 Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
126 ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
127 assertEquals("The index Term should not be included.", 0, hits.length);
129 query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
130 hits = searcher.search(query, null, 1000).scoreDocs;
131 assertEquals("The index Term should be included.", 1, hits.length);
135 public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
136 String firstEnd, String secondBeg, String secondEnd) throws Exception {
138 RAMDirectory farsiIndex = new RAMDirectory();
139 IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
140 TEST_VERSION_CURRENT, analyzer));
141 Document doc = new Document();
142 doc.add(new Field("content", "\u0633\u0627\u0628",
143 Field.Store.YES, Field.Index.ANALYZED));
144 doc.add(new Field("body", "body",
145 Field.Store.YES, Field.Index.NOT_ANALYZED));
146 writer.addDocument(doc);
149 IndexReader reader = IndexReader.open(farsiIndex, true);
150 IndexSearcher search = newSearcher(reader);
152 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
153 // orders the U+0698 character before the U+0633 character, so the single
154 // index Term below should NOT be returned by a TermRangeQuery
155 // with a Farsi Collator (or an Arabic one for the case when Farsi is
158 = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
159 ScoreDoc[] result = search.search(csrq, null, 1000).scoreDocs;
160 assertEquals("The index Term should not be included.", 0, result.length);
162 csrq = new TermRangeQuery
163 ("content", secondBeg, secondEnd, true, true);
164 result = search.search(csrq, null, 1000).scoreDocs;
165 assertEquals("The index Term should be included.", 1, result.length);
169 // Test using various international locales with accented characters (which
170 // sort differently depending on locale)
172 // Copied (and slightly modified) from
173 // org.apache.lucene.search.TestSort.testInternationalSort()
175 // TODO: this test is really fragile. there are already 3 different cases,
176 // depending upon unicode version.
177 public void testCollationKeySort(Analyzer usAnalyzer,
178 Analyzer franceAnalyzer,
179 Analyzer swedenAnalyzer,
180 Analyzer denmarkAnalyzer,
184 String dkResult) throws Exception {
185 RAMDirectory indexStore = new RAMDirectory();
186 PerFieldAnalyzerWrapper analyzer
187 = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
188 analyzer.addAnalyzer("US", usAnalyzer);
189 analyzer.addAnalyzer("France", franceAnalyzer);
190 analyzer.addAnalyzer("Sweden", swedenAnalyzer);
191 analyzer.addAnalyzer("Denmark", denmarkAnalyzer);
192 IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
193 TEST_VERSION_CURRENT, analyzer));
196 // the tracer field is used to determine which document was hit
197 String[][] sortData = new String[][] {
198 // tracer contents US France Sweden (sv_SE) Denmark (da_DK)
199 { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" },
200 { "B", "y", "HAT", "HAT", "HAT", "HAT" },
201 { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
202 { "D", "y", "HUT", "HUT", "HUT", "HUT" },
203 { "E", "x", "peach", "peach", "peach", "peach" },
204 { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" },
205 { "G", "x", "sin", "sin", "sin", "sin" },
206 { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" },
207 { "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" },
208 { "J", "y", "HOT", "HOT", "HOT", "HOT" },
211 for (int i = 0 ; i < sortData.length ; ++i) {
212 Document doc = new Document();
213 doc.add(new Field("tracer", sortData[i][0],
214 Field.Store.YES, Field.Index.NO));
215 doc.add(new Field("contents", sortData[i][1],
216 Field.Store.NO, Field.Index.ANALYZED));
217 if (sortData[i][2] != null)
218 doc.add(new Field("US", sortData[i][2],
219 Field.Store.NO, Field.Index.ANALYZED));
220 if (sortData[i][3] != null)
221 doc.add(new Field("France", sortData[i][3],
222 Field.Store.NO, Field.Index.ANALYZED));
223 if (sortData[i][4] != null)
224 doc.add(new Field("Sweden", sortData[i][4],
225 Field.Store.NO, Field.Index.ANALYZED));
226 if (sortData[i][5] != null)
227 doc.add(new Field("Denmark", sortData[i][5],
228 Field.Store.NO, Field.Index.ANALYZED));
229 writer.addDocument(doc);
233 Searcher searcher = new IndexSearcher(indexStore, true);
235 Sort sort = new Sort();
236 Query queryX = new TermQuery(new Term ("contents", "x"));
237 Query queryY = new TermQuery(new Term ("contents", "y"));
239 sort.setSort(new SortField("US", SortField.STRING));
240 assertMatches(searcher, queryY, sort, usResult);
242 sort.setSort(new SortField("France", SortField.STRING));
243 assertMatches(searcher, queryX, sort, frResult);
245 sort.setSort(new SortField("Sweden", SortField.STRING));
246 assertMatches(searcher, queryY, sort, svResult);
248 sort.setSort(new SortField("Denmark", SortField.STRING));
249 assertMatches(searcher, queryY, sort, dkResult);
252 // Make sure the documents returned by the search match the expected list
253 // Copied from TestSort.java
254 private void assertMatches(Searcher searcher, Query query, Sort sort,
255 String expectedResult) throws IOException {
256 ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
257 StringBuilder buff = new StringBuilder(10);
258 int n = result.length;
259 for (int i = 0 ; i < n ; ++i) {
260 Document doc = searcher.doc(result[i].doc);
261 String[] v = doc.getValues("tracer");
262 for (int j = 0 ; j < v.length ; ++j) {
266 assertEquals(expectedResult, buff.toString());
269 public void assertThreadSafe(final Analyzer analyzer) throws Exception {
270 int numTestPoints = 100;
271 int numThreads = _TestUtil.nextInt(random, 3, 5);
272 final HashMap<String,String> map = new HashMap<String,String>();
274 // create a map<String,SortKey> up front.
275 // then with multiple threads, generate sort keys for all the keys in the map
276 // and ensure they are the same as the ones we produced in serial fashion.
278 for (int i = 0; i < numTestPoints; i++) {
279 String term = _TestUtil.randomSimpleString(random);
280 TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
281 CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
283 assertTrue(ts.incrementToken());
284 // ensure we make a copy of the actual bytes too
285 map.put(term, encodedBytes.toString());
288 Thread threads[] = new Thread[numThreads];
289 for (int i = 0; i < numThreads; i++) {
290 threads[i] = new Thread() {
294 for (Map.Entry<String,String> mapping : map.entrySet()) {
295 String term = mapping.getKey();
296 String expected = mapping.getValue();
297 TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
298 CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
300 assertTrue(ts.incrementToken());
301 assertEquals(expected, encodedBytes.toString());
303 } catch (IOException e) {
304 throw new RuntimeException(e);
309 for (int i = 0; i < numThreads; i++) {
312 for (int i = 0; i < numThreads; i++) {