lucene-java-3.5.0/lucene/src/test-framework/java/org/apache/lucene/analysis/CollationTestBase.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20
  21 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  22 import org.apache.lucene.store.RAMDirectory;
  23 import org.apache.lucene.index.IndexWriter;
  24 import org.apache.lucene.index.IndexWriterConfig;
  25 import org.apache.lucene.index.Term;
  26 import org.apache.lucene.index.IndexReader;
  27 import org.apache.lucene.search.IndexSearcher;
  28 import org.apache.lucene.search.ScoreDoc;
  29 import org.apache.lucene.search.Query;
  30 import org.apache.lucene.search.TermRangeFilter;
  31 import org.apache.lucene.search.TermQuery;
  32 import org.apache.lucene.search.TermRangeQuery;
  33 import org.apache.lucene.search.Searcher;
  34 import org.apache.lucene.search.Sort;
  35 import org.apache.lucene.search.SortField;
  36 import org.apache.lucene.document.Field;
  37 import org.apache.lucene.document.Document;
  38 import org.apache.lucene.util.IndexableBinaryStringTools;
  39 import org.apache.lucene.util.LuceneTestCase;
  40 import org.apache.lucene.util._TestUtil;
  41
  42 import java.io.IOException;
  43 import java.io.StringReader;
  44 import java.util.HashMap;
  45 import java.util.Map;
  46
  47 public abstract class CollationTestBase extends LuceneTestCase {
  48
  49   protected String firstRangeBeginningOriginal = "\u062F";
  50   protected String firstRangeEndOriginal = "\u0698";
  51
  52   protected String secondRangeBeginningOriginal = "\u0633";
  53   protected String secondRangeEndOriginal = "\u0638";
  54
  55   /**
  56    * Convenience method to perform the same function as CollationKeyFilter.
  57    *
  58    * @param keyBits the result from
  59    *  collator.getCollationKey(original).toByteArray()
  60    * @return The encoded collation key for the original String
  61    */
  62   protected String encodeCollationKey(byte[] keyBits) {
  63     // Ensure that the backing char[] array is large enough to hold the encoded
  64     // Binary String
  65     int encodedLength = IndexableBinaryStringTools.getEncodedLength(keyBits, 0, keyBits.length);
  66     char[] encodedBegArray = new char[encodedLength];
  67     IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
  68     return new String(encodedBegArray);
  69   }
  70
  71   public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
  72                                             String firstEnd, String secondBeg,
  73                                             String secondEnd) throws Exception {
  74     RAMDirectory ramDir = new RAMDirectory();
  75     IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
  76         TEST_VERSION_CURRENT, analyzer));
  77     Document doc = new Document();
  78     doc.add(new Field("content", "\u0633\u0627\u0628",
  79                       Field.Store.YES, Field.Index.ANALYZED));
  80     doc.add(new Field("body", "body",
  81                       Field.Store.YES, Field.Index.NOT_ANALYZED));
  82     writer.addDocument(doc);
  83     writer.close();
  84     IndexReader reader = IndexReader.open(ramDir);
  85     IndexSearcher searcher = new IndexSearcher(reader);
  86     Query query = new TermQuery(new Term("body","body"));
  87
  88     // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
  89     // orders the U+0698 character before the U+0633 character, so the single
  90     // index Term below should NOT be returned by a TermRangeFilter with a Farsi
  91     // Collator (or an Arabic one for the case when Farsi searcher not
  92     // supported).
  93     ScoreDoc[] result = searcher.search
  94       (query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).scoreDocs;
  95     assertEquals("The index Term should not be included.", 0, result.length);
  96
  97     result = searcher.search
  98       (query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).scoreDocs;
  99     assertEquals("The index Term should be included.", 1, result.length);
 100
 101     searcher.close();
 102     reader.close();
 103   }
 104
 105   public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
 106                                             String firstEnd, String secondBeg,
 107                                             String secondEnd) throws Exception {
 108     RAMDirectory ramDir = new RAMDirectory();
 109     IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
 110         TEST_VERSION_CURRENT, analyzer));
 111     Document doc = new Document();
 112
 113     // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
 114     // orders the U+0698 character before the U+0633 character, so the single
 115     // index Term below should NOT be returned by a TermRangeQuery with a Farsi
 116     // Collator (or an Arabic one for the case when Farsi is not supported).
 117     doc.add(new Field("content", "\u0633\u0627\u0628",
 118                       Field.Store.YES, Field.Index.ANALYZED));
 119     writer.addDocument(doc);
 120     writer.close();
 121     IndexReader reader = IndexReader.open(ramDir);
 122     IndexSearcher searcher = new IndexSearcher(reader);
 123
 124     Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
 125     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
 126     assertEquals("The index Term should not be included.", 0, hits.length);
 127
 128     query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
 129     hits = searcher.search(query, null, 1000).scoreDocs;
 130     assertEquals("The index Term should be included.", 1, hits.length);
 131     searcher.close();
 132     reader.close();
 133   }
 134
 135   public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
 136       String firstEnd, String secondBeg, String secondEnd) throws Exception {
 137
 138     RAMDirectory farsiIndex = new RAMDirectory();
 139     IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
 140         TEST_VERSION_CURRENT, analyzer));
 141     Document doc = new Document();
 142     doc.add(new Field("content", "\u0633\u0627\u0628",
 143                       Field.Store.YES, Field.Index.ANALYZED));
 144     doc.add(new Field("body", "body",
 145                       Field.Store.YES, Field.Index.NOT_ANALYZED));
 146     writer.addDocument(doc);
 147     writer.close();
 148
 149     IndexReader reader = IndexReader.open(farsiIndex, true);
 150     IndexSearcher search = newSearcher(reader);
 151
 152     // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
 153     // orders the U+0698 character before the U+0633 character, so the single
 154     // index Term below should NOT be returned by a TermRangeQuery
 155     // with a Farsi Collator (or an Arabic one for the case when Farsi is
 156     // not supported).
 157     Query csrq
 158       = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
 159     ScoreDoc[] result = search.search(csrq, null, 1000).scoreDocs;
 160     assertEquals("The index Term should not be included.", 0, result.length);
 161
 162     csrq = new TermRangeQuery
 163       ("content", secondBeg, secondEnd, true, true);
 164     result = search.search(csrq, null, 1000).scoreDocs;
 165     assertEquals("The index Term should be included.", 1, result.length);
 166     search.close();
 167   }
 168
 169   // Test using various international locales with accented characters (which
 170   // sort differently depending on locale)
 171   //
 172   // Copied (and slightly modified) from
 173   // org.apache.lucene.search.TestSort.testInternationalSort()
 174   //
 175   // TODO: this test is really fragile. there are already 3 different cases,
 176   // depending upon unicode version.
 177   public void testCollationKeySort(Analyzer usAnalyzer,
 178                                    Analyzer franceAnalyzer,
 179                                    Analyzer swedenAnalyzer,
 180                                    Analyzer denmarkAnalyzer,
 181                                    String usResult,
 182                                    String frResult,
 183                                    String svResult,
 184                                    String dkResult) throws Exception {
 185     RAMDirectory indexStore = new RAMDirectory();
 186     PerFieldAnalyzerWrapper analyzer
 187       = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
 188     analyzer.addAnalyzer("US", usAnalyzer);
 189     analyzer.addAnalyzer("France", franceAnalyzer);
 190     analyzer.addAnalyzer("Sweden", swedenAnalyzer);
 191     analyzer.addAnalyzer("Denmark", denmarkAnalyzer);
 192     IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
 193         TEST_VERSION_CURRENT, analyzer));
 194
 195     // document data:
 196     // the tracer field is used to determine which document was hit
 197     String[][] sortData = new String[][] {
 198       // tracer contents US                 France             Sweden (sv_SE)     Denmark (da_DK)
 199       {  "A",   "x",     "p\u00EAche",      "p\u00EAche",      "p\u00EAche",      "p\u00EAche"      },
 200       {  "B",   "y",     "HAT",             "HAT",             "HAT",             "HAT"             },
 201       {  "C",   "x",     "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
 202       {  "D",   "y",     "HUT",             "HUT",             "HUT",             "HUT"             },
 203       {  "E",   "x",     "peach",           "peach",           "peach",           "peach"           },
 204       {  "F",   "y",     "H\u00C5T",        "H\u00C5T",        "H\u00C5T",        "H\u00C5T"        },
 205       {  "G",   "x",     "sin",             "sin",             "sin",             "sin"             },
 206       {  "H",   "y",     "H\u00D8T",        "H\u00D8T",        "H\u00D8T",        "H\u00D8T"        },
 207       {  "I",   "x",     "s\u00EDn",        "s\u00EDn",        "s\u00EDn",        "s\u00EDn"        },
 208       {  "J",   "y",     "HOT",             "HOT",             "HOT",             "HOT"             },
 209     };
 210
 211     for (int i = 0 ; i < sortData.length ; ++i) {
 212       Document doc = new Document();
 213       doc.add(new Field("tracer", sortData[i][0],
 214                         Field.Store.YES, Field.Index.NO));
 215       doc.add(new Field("contents", sortData[i][1],
 216                         Field.Store.NO, Field.Index.ANALYZED));
 217       if (sortData[i][2] != null)
 218         doc.add(new Field("US", sortData[i][2],
 219                           Field.Store.NO, Field.Index.ANALYZED));
 220       if (sortData[i][3] != null)
 221         doc.add(new Field("France", sortData[i][3],
 222                           Field.Store.NO, Field.Index.ANALYZED));
 223       if (sortData[i][4] != null)
 224         doc.add(new Field("Sweden", sortData[i][4],
 225                           Field.Store.NO, Field.Index.ANALYZED));
 226       if (sortData[i][5] != null)
 227         doc.add(new Field("Denmark", sortData[i][5],
 228                           Field.Store.NO, Field.Index.ANALYZED));
 229       writer.addDocument(doc);
 230     }
 231     writer.forceMerge(1);
 232     writer.close();
 233     IndexReader reader = IndexReader.open(indexStore);
 234     IndexSearcher searcher = new IndexSearcher(reader);
 235
 236     Sort sort = new Sort();
 237     Query queryX = new TermQuery(new Term ("contents", "x"));
 238     Query queryY = new TermQuery(new Term ("contents", "y"));
 239
 240     sort.setSort(new SortField("US", SortField.STRING));
 241     assertMatches(searcher, queryY, sort, usResult);
 242
 243     sort.setSort(new SortField("France", SortField.STRING));
 244     assertMatches(searcher, queryX, sort, frResult);
 245
 246     sort.setSort(new SortField("Sweden", SortField.STRING));
 247     assertMatches(searcher, queryY, sort, svResult);
 248
 249     sort.setSort(new SortField("Denmark", SortField.STRING));
 250     assertMatches(searcher, queryY, sort, dkResult);
 251     searcher.close();
 252     reader.close();
 253   }
 254
 255   // Make sure the documents returned by the search match the expected list
 256   // Copied from TestSort.java
 257   private void assertMatches(Searcher searcher, Query query, Sort sort,
 258                              String expectedResult) throws IOException {
 259     ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
 260     StringBuilder buff = new StringBuilder(10);
 261     int n = result.length;
 262     for (int i = 0 ; i < n ; ++i) {
 263       Document doc = searcher.doc(result[i].doc);
 264       String[] v = doc.getValues("tracer");
 265       for (int j = 0 ; j < v.length ; ++j) {
 266         buff.append(v[j]);
 267       }
 268     }
 269     assertEquals(expectedResult, buff.toString());
 270   }
 271
 272   public void assertThreadSafe(final Analyzer analyzer) throws Exception {
 273     int numTestPoints = 100;
 274     int numThreads = _TestUtil.nextInt(random, 3, 5);
 275     final HashMap<String,String> map = new HashMap<String,String>();
 276
 277     // create a map<String,SortKey> up front.
 278     // then with multiple threads, generate sort keys for all the keys in the map
 279     // and ensure they are the same as the ones we produced in serial fashion.
 280
 281     for (int i = 0; i < numTestPoints; i++) {
 282       String term = _TestUtil.randomSimpleString(random);
 283       TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
 284       CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
 285       ts.reset();
 286       assertTrue(ts.incrementToken());
 287       // ensure we make a copy of the actual bytes too
 288       map.put(term, encodedBytes.toString());
 289     }
 290
 291     Thread threads[] = new Thread[numThreads];
 292     for (int i = 0; i < numThreads; i++) {
 293       threads[i] = new Thread() {
 294         @Override
 295         public void run() {
 296           try {
 297             for (Map.Entry<String,String> mapping : map.entrySet()) {
 298               String term = mapping.getKey();
 299               String expected = mapping.getValue();
 300               TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
 301               CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
 302               ts.reset();
 303               assertTrue(ts.incrementToken());
 304               assertEquals(expected, encodedBytes.toString());
 305             }
 306           } catch (IOException e) {
 307             throw new RuntimeException(e);
 308           }
 309         }
 310       };
 311     }
 312     for (int i = 0; i < numThreads; i++) {
 313       threads[i].start();
 314     }
 315     for (int i = 0; i < numThreads; i++) {
 316       threads[i].join();
 317     }
 318   }
 319 }