+++ /dev/null
-package org.apache.lucene.analysis;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeFilter;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.search.Searcher;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.util.IndexableBinaryStringTools;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
-
-public abstract class CollationTestBase extends LuceneTestCase {
-
- protected String firstRangeBeginningOriginal = "\u062F"; // start of the first range: U+062F
- protected String firstRangeEndOriginal = "\u0698"; // end of the first range: U+0698
-
- protected String secondRangeBeginningOriginal = "\u0633"; // start of the second range: U+0633
- protected String secondRangeEndOriginal = "\u0638"; // end of the second range: U+0638
-
- /**
- * Convenience method to perform the same function as CollationKeyFilter.
- *
- * @param keyBits the result from
- * collator.getCollationKey(original).toByteArray()
- * @return The encoded collation key for the original String
- */
- protected String encodeCollationKey(byte[] keyBits) {
- // Ensure that the backing char[] array is large enough to hold the encoded
- // Binary String
- int encodedLength = IndexableBinaryStringTools.getEncodedLength(keyBits, 0, keyBits.length);
- char[] encodedBegArray = new char[encodedLength];
- IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
- return new String(encodedBegArray);
- }
-
- /**
-  * Indexes a single document whose "content" field is U+0633 U+0627 U+0628
-  * and checks that a TermRangeFilter over [firstBeg, firstEnd] rejects it
-  * while one over [secondBeg, secondEnd] accepts it.
-  *
-  * @param analyzer analyzer used when indexing the document
-  * @param firstBeg inclusive lower bound of the range expected to miss
-  * @param firstEnd inclusive upper bound of the range expected to miss
-  * @param secondBeg inclusive lower bound of the range expected to hit
-  * @param secondEnd inclusive upper bound of the range expected to hit
-  * @throws Exception if indexing or searching fails
-  */
- public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
-                                           String firstEnd, String secondBeg,
-                                           String secondEnd) throws Exception {
-   RAMDirectory ramDir = new RAMDirectory();
-   IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
-       TEST_VERSION_CURRENT, analyzer));
-   try {
-     Document doc = new Document();
-     doc.add(new Field("content", "\u0633\u0627\u0628",
-                       Field.Store.YES, Field.Index.ANALYZED));
-     doc.add(new Field("body", "body",
-                       Field.Store.YES, Field.Index.NOT_ANALYZED));
-     writer.addDocument(doc);
-   } finally {
-     // Close the writer even if indexing fails.
-     writer.close();
-   }
-
-   IndexSearcher searcher = new IndexSearcher(ramDir, true);
-   try {
-     Query query = new TermQuery(new Term("body", "body"));
-
-     // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
-     // orders the U+0698 character before the U+0633 character, so the single
-     // index Term below should NOT be returned by a TermRangeFilter with a Farsi
-     // Collator (or an Arabic one for the case when Farsi searcher not
-     // supported).
-     ScoreDoc[] result = searcher.search(
-         query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).scoreDocs;
-     assertEquals("The index Term should not be included.", 0, result.length);
-
-     result = searcher.search(
-         query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).scoreDocs;
-     assertEquals("The index Term should be included.", 1, result.length);
-   } finally {
-     // Close the searcher and directory even when an assertion above fails.
-     searcher.close();
-     ramDir.close();
-   }
- }
-
- /**
-  * Indexes a single document whose "content" field is U+0633 U+0627 U+0628
-  * and checks that a TermRangeQuery over [firstBeg, firstEnd] finds nothing
-  * while one over [secondBeg, secondEnd] finds the document.
-  *
-  * @param analyzer analyzer used when indexing the document
-  * @param firstBeg inclusive lower bound of the range expected to miss
-  * @param firstEnd inclusive upper bound of the range expected to miss
-  * @param secondBeg inclusive lower bound of the range expected to hit
-  * @param secondEnd inclusive upper bound of the range expected to hit
-  * @throws Exception if indexing or searching fails
-  */
- public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
-                                          String firstEnd, String secondBeg,
-                                          String secondEnd) throws Exception {
-   RAMDirectory ramDir = new RAMDirectory();
-   IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
-       TEST_VERSION_CURRENT, analyzer));
-   try {
-     Document doc = new Document();
-
-     // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
-     // orders the U+0698 character before the U+0633 character, so the single
-     // index Term below should NOT be returned by a TermRangeQuery with a Farsi
-     // Collator (or an Arabic one for the case when Farsi is not supported).
-     doc.add(new Field("content", "\u0633\u0627\u0628",
-                       Field.Store.YES, Field.Index.ANALYZED));
-     writer.addDocument(doc);
-   } finally {
-     // Close the writer even if indexing fails.
-     writer.close();
-   }
-
-   IndexSearcher searcher = new IndexSearcher(ramDir, true);
-   try {
-     Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
-     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
-     assertEquals("The index Term should not be included.", 0, hits.length);
-
-     query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
-     hits = searcher.search(query, null, 1000).scoreDocs;
-     assertEquals("The index Term should be included.", 1, hits.length);
-   } finally {
-     // Close the searcher and directory even when an assertion above fails.
-     searcher.close();
-     ramDir.close();
-   }
- }
-
- /**
-  * Same check as the range-query test, but the searcher is obtained through
-  * newSearcher(reader) over an explicitly opened read-only IndexReader.
-  *
-  * @param analyzer analyzer used when indexing the document
-  * @param firstBeg inclusive lower bound of the range expected to miss
-  * @param firstEnd inclusive upper bound of the range expected to miss
-  * @param secondBeg inclusive lower bound of the range expected to hit
-  * @param secondEnd inclusive upper bound of the range expected to hit
-  * @throws Exception if indexing or searching fails
-  */
- public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
-     String firstEnd, String secondBeg, String secondEnd) throws Exception {
-
-   RAMDirectory farsiIndex = new RAMDirectory();
-   IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
-       TEST_VERSION_CURRENT, analyzer));
-   try {
-     Document doc = new Document();
-     doc.add(new Field("content", "\u0633\u0627\u0628",
-                       Field.Store.YES, Field.Index.ANALYZED));
-     doc.add(new Field("body", "body",
-                       Field.Store.YES, Field.Index.NOT_ANALYZED));
-     writer.addDocument(doc);
-   } finally {
-     // Close the writer even if indexing fails.
-     writer.close();
-   }
-
-   IndexReader reader = IndexReader.open(farsiIndex, true);
-   try {
-     IndexSearcher search = newSearcher(reader);
-     try {
-       // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
-       // orders the U+0698 character before the U+0633 character, so the single
-       // index Term below should NOT be returned by a TermRangeQuery
-       // with a Farsi Collator (or an Arabic one for the case when Farsi is
-       // not supported).
-       Query csrq = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
-       ScoreDoc[] result = search.search(csrq, null, 1000).scoreDocs;
-       assertEquals("The index Term should not be included.", 0, result.length);
-
-       csrq = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
-       result = search.search(csrq, null, 1000).scoreDocs;
-       assertEquals("The index Term should be included.", 1, result.length);
-     } finally {
-       search.close();
-     }
-   } finally {
-     // The reader was opened here, so it is closed here as well rather than
-     // relying on the searcher to do it.
-     reader.close();
-     farsiIndex.close();
-   }
- }
-
- // Test using various international locales with accented characters (which
- // sort differently depending on locale)
- //
- // Copied (and slightly modified) from
- // org.apache.lucene.search.TestSort.testInternationalSort()
- //
- // TODO: this test is really fragile. there are already 3 different cases,
- // depending upon unicode version.
- public void testCollationKeySort(Analyzer usAnalyzer,
- Analyzer franceAnalyzer,
- Analyzer swedenAnalyzer,
- Analyzer denmarkAnalyzer,
- String usResult,
- String frResult,
- String svResult,
- String dkResult) throws Exception {
- RAMDirectory indexStore = new RAMDirectory();
- PerFieldAnalyzerWrapper analyzer
- = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
- analyzer.addAnalyzer("US", usAnalyzer);
- analyzer.addAnalyzer("France", franceAnalyzer);
- analyzer.addAnalyzer("Sweden", swedenAnalyzer);
- analyzer.addAnalyzer("Denmark", denmarkAnalyzer);
- IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
- TEST_VERSION_CURRENT, analyzer));
-
- // document data:
- // the tracer field is used to determine which document was hit
- String[][] sortData = new String[][] {
- // tracer contents US France Sweden (sv_SE) Denmark (da_DK)
- { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" },
- { "B", "y", "HAT", "HAT", "HAT", "HAT" },
- { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
- { "D", "y", "HUT", "HUT", "HUT", "HUT" },
- { "E", "x", "peach", "peach", "peach", "peach" },
- { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" },
- { "G", "x", "sin", "sin", "sin", "sin" },
- { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" },
- { "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" },
- { "J", "y", "HOT", "HOT", "HOT", "HOT" },
- };
-
- for (int i = 0 ; i < sortData.length ; ++i) {
- Document doc = new Document();
- doc.add(new Field("tracer", sortData[i][0],
- Field.Store.YES, Field.Index.NO));
- doc.add(new Field("contents", sortData[i][1],
- Field.Store.NO, Field.Index.ANALYZED));
- if (sortData[i][2] != null)
- doc.add(new Field("US", sortData[i][2],
- Field.Store.NO, Field.Index.ANALYZED));
- if (sortData[i][3] != null)
- doc.add(new Field("France", sortData[i][3],
- Field.Store.NO, Field.Index.ANALYZED));
- if (sortData[i][4] != null)
- doc.add(new Field("Sweden", sortData[i][4],
- Field.Store.NO, Field.Index.ANALYZED));
- if (sortData[i][5] != null)
- doc.add(new Field("Denmark", sortData[i][5],
- Field.Store.NO, Field.Index.ANALYZED));
- writer.addDocument(doc);
- }
- writer.optimize();
- writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
-
- Sort sort = new Sort();
- Query queryX = new TermQuery(new Term ("contents", "x"));
- Query queryY = new TermQuery(new Term ("contents", "y"));
-
- sort.setSort(new SortField("US", SortField.STRING));
- assertMatches(searcher, queryY, sort, usResult);
-
- sort.setSort(new SortField("France", SortField.STRING));
- assertMatches(searcher, queryX, sort, frResult);
-
- sort.setSort(new SortField("Sweden", SortField.STRING));
- assertMatches(searcher, queryY, sort, svResult);
-
- sort.setSort(new SortField("Denmark", SortField.STRING));
- assertMatches(searcher, queryY, sort, dkResult);
- }
-
- // Runs the query with the given sort (at most 1000 hits) and checks that the
- // "tracer" values of the hits, concatenated in result order, equal the
- // expected list.
- // Copied from TestSort.java
- private void assertMatches(Searcher searcher, Query query, Sort sort,
-                            String expectedResult) throws IOException {
-   ScoreDoc[] hits = searcher.search(query, null, 1000, sort).scoreDocs;
-   StringBuilder tracers = new StringBuilder(10);
-   for (ScoreDoc hit : hits) {
-     Document stored = searcher.doc(hit.doc);
-     for (String tracer : stored.getValues("tracer")) {
-       tracers.append(tracer);
-     }
-   }
-   assertEquals(expectedResult, tracers.toString());
- }
-
- public void assertThreadSafe(final Analyzer analyzer) throws Exception {
- int numTestPoints = 100;
- int numThreads = _TestUtil.nextInt(random, 3, 5);
- final HashMap<String,String> map = new HashMap<String,String>();
-
- // create a map<String,SortKey> up front.
- // then with multiple threads, generate sort keys for all the keys in the map
- // and ensure they are the same as the ones we produced in serial fashion.
-
- for (int i = 0; i < numTestPoints; i++) {
- String term = _TestUtil.randomSimpleString(random);
- TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
- CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
- ts.reset();
- assertTrue(ts.incrementToken());
- // ensure we make a copy of the actual bytes too
- map.put(term, encodedBytes.toString());
- }
-
- Thread threads[] = new Thread[numThreads];
- for (int i = 0; i < numThreads; i++) {
- threads[i] = new Thread() {
- @Override
- public void run() {
- try {
- for (Map.Entry<String,String> mapping : map.entrySet()) {
- String term = mapping.getKey();
- String expected = mapping.getValue();
- TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
- CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
- ts.reset();
- assertTrue(ts.incrementToken());
- assertEquals(expected, encodedBytes.toString());
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- };
- }
- for (int i = 0; i < numThreads; i++) {
- threads[i].start();
- }
- for (int i = 0; i < numThreads; i++) {
- threads[i].join();
- }
- }
-}