1 package org.apache.lucene.collation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
51 public abstract class CollationTestBase extends LuceneTestCase {
  // Endpoints for the two Farsi term-range tests below.
  // First range [U+062F, U+0698]: under Unicode code-point order this range
  // contains U+0633, but under Farsi collation it does not.
  protected String firstRangeBeginningOriginal = "\u062F";
  protected String firstRangeEndOriginal = "\u0698";

  // Second range [U+0633, U+0638]: contains U+0633 under both orderings.
  protected String secondRangeBeginningOriginal = "\u0633";
  protected String secondRangeEndOriginal = "\u0638";
60 * Convenience method to perform the same function as CollationKeyFilter.
62 * @param keyBits the result from
63 * collator.getCollationKey(original).toByteArray()
64 * @return The encoded collation key for the original String
66 protected String encodeCollationKey(byte[] keyBits) {
67 // Ensure that the backing char[] array is large enough to hold the encoded
69 int encodedLength = IndexableBinaryStringTools.getEncodedLength(keyBits, 0, keyBits.length);
70 char[] encodedBegArray = new char[encodedLength];
71 IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
72 return new String(encodedBegArray);
75 public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
76 String firstEnd, String secondBeg,
77 String secondEnd) throws Exception {
78 RAMDirectory ramDir = new RAMDirectory();
79 IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
80 TEST_VERSION_CURRENT, analyzer));
81 Document doc = new Document();
82 doc.add(new Field("content", "\u0633\u0627\u0628",
83 Field.Store.YES, Field.Index.ANALYZED));
84 doc.add(new Field("body", "body",
85 Field.Store.YES, Field.Index.NOT_ANALYZED));
86 writer.addDocument(doc);
88 IndexSearcher searcher = new IndexSearcher(ramDir, true);
89 Query query = new TermQuery(new Term("body","body"));
91 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
92 // orders the U+0698 character before the U+0633 character, so the single
93 // index Term below should NOT be returned by a TermRangeFilter with a Farsi
94 // Collator (or an Arabic one for the case when Farsi searcher not
96 ScoreDoc[] result = searcher.search
97 (query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).scoreDocs;
98 assertEquals("The index Term should not be included.", 0, result.length);
100 result = searcher.search
101 (query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).scoreDocs;
102 assertEquals("The index Term should be included.", 1, result.length);
107 public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
108 String firstEnd, String secondBeg,
109 String secondEnd) throws Exception {
110 RAMDirectory ramDir = new RAMDirectory();
111 IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
112 TEST_VERSION_CURRENT, analyzer));
113 Document doc = new Document();
115 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
116 // orders the U+0698 character before the U+0633 character, so the single
117 // index Term below should NOT be returned by a TermRangeQuery with a Farsi
118 // Collator (or an Arabic one for the case when Farsi is not supported).
119 doc.add(new Field("content", "\u0633\u0627\u0628",
120 Field.Store.YES, Field.Index.ANALYZED));
121 writer.addDocument(doc);
123 IndexSearcher searcher = new IndexSearcher(ramDir, true);
125 Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
126 ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
127 assertEquals("The index Term should not be included.", 0, hits.length);
129 query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
130 hits = searcher.search(query, null, 1000).scoreDocs;
131 assertEquals("The index Term should be included.", 1, hits.length);
135 public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
136 String firstEnd, String secondBeg, String secondEnd) throws Exception {
138 RAMDirectory farsiIndex = new RAMDirectory();
139 IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
140 TEST_VERSION_CURRENT, analyzer));
141 Document doc = new Document();
142 doc.add(new Field("content", "\u0633\u0627\u0628",
143 Field.Store.YES, Field.Index.ANALYZED));
144 doc.add(new Field("body", "body",
145 Field.Store.YES, Field.Index.NOT_ANALYZED));
146 writer.addDocument(doc);
149 IndexReader reader = IndexReader.open(farsiIndex, true);
150 IndexSearcher search = newSearcher(reader);
152 // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
153 // orders the U+0698 character before the U+0633 character, so the single
154 // index Term below should NOT be returned by a TermRangeQuery
155 // with a Farsi Collator (or an Arabic one for the case when Farsi is
158 = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
159 ScoreDoc[] result = search.search(csrq, null, 1000).scoreDocs;
160 assertEquals("The index Term should not be included.", 0, result.length);
162 csrq = new TermRangeQuery
163 ("content", secondBeg, secondEnd, true, true);
164 result = search.search(csrq, null, 1000).scoreDocs;
165 assertEquals("The index Term should be included.", 1, result.length);
169 // Test using various international locales with accented characters (which
170 // sort differently depending on locale)
172 // Copied (and slightly modified) from
173 // org.apache.lucene.search.TestSort.testInternationalSort()
175 // TODO: this test is really fragile. there are already 3 different cases,
176 // depending upon unicode version.
177 public void testCollationKeySort(Analyzer usAnalyzer,
178 Analyzer franceAnalyzer,
179 Analyzer swedenAnalyzer,
180 Analyzer denmarkAnalyzer,
184 String dkResult) throws Exception {
185 RAMDirectory indexStore = new RAMDirectory();
186 PerFieldAnalyzerWrapper analyzer
187 = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
188 analyzer.addAnalyzer("US", usAnalyzer);
189 analyzer.addAnalyzer("France", franceAnalyzer);
190 analyzer.addAnalyzer("Sweden", swedenAnalyzer);
191 analyzer.addAnalyzer("Denmark", denmarkAnalyzer);
192 IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
193 TEST_VERSION_CURRENT, analyzer));
196 // the tracer field is used to determine which document was hit
197 String[][] sortData = new String[][] {
198 // tracer contents US France Sweden (sv_SE) Denmark (da_DK)
199 { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" },
200 { "B", "y", "HAT", "HAT", "HAT", "HAT" },
201 { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
202 { "D", "y", "HUT", "HUT", "HUT", "HUT" },
203 { "E", "x", "peach", "peach", "peach", "peach" },
204 { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" },
205 { "G", "x", "sin", "sin", "sin", "sin" },
206 { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" },
207 { "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" },
208 { "J", "y", "HOT", "HOT", "HOT", "HOT" },
211 for (int i = 0 ; i < sortData.length ; ++i) {
212 Document doc = new Document();
213 doc.add(new Field("tracer", sortData[i][0],
214 Field.Store.YES, Field.Index.NO));
215 doc.add(new Field("contents", sortData[i][1],
216 Field.Store.NO, Field.Index.ANALYZED));
217 if (sortData[i][2] != null)
218 doc.add(new Field("US", sortData[i][2],
219 Field.Store.NO, Field.Index.ANALYZED));
220 if (sortData[i][3] != null)
221 doc.add(new Field("France", sortData[i][3],
222 Field.Store.NO, Field.Index.ANALYZED));
223 if (sortData[i][4] != null)
224 doc.add(new Field("Sweden", sortData[i][4],
225 Field.Store.NO, Field.Index.ANALYZED));
226 if (sortData[i][5] != null)
227 doc.add(new Field("Denmark", sortData[i][5],
228 Field.Store.NO, Field.Index.ANALYZED));
229 writer.addDocument(doc);
233 Searcher searcher = new IndexSearcher(indexStore, true);
235 Sort sort = new Sort();
236 Query queryX = new TermQuery(new Term ("contents", "x"));
237 Query queryY = new TermQuery(new Term ("contents", "y"));
239 sort.setSort(new SortField("US", SortField.STRING));
240 assertMatches(searcher, queryY, sort, usResult);
242 sort.setSort(new SortField("France", SortField.STRING));
243 assertMatches(searcher, queryX, sort, frResult);
245 sort.setSort(new SortField("Sweden", SortField.STRING));
246 assertMatches(searcher, queryY, sort, svResult);
248 sort.setSort(new SortField("Denmark", SortField.STRING));
249 assertMatches(searcher, queryY, sort, dkResult);
252 // Make sure the documents returned by the search match the expected list
253 // Copied from TestSort.java
254 private void assertMatches(Searcher searcher, Query query, Sort sort,
255 String expectedResult) throws IOException {
256 ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
257 StringBuilder buff = new StringBuilder(10);
258 int n = result.length;
259 for (int i = 0 ; i < n ; ++i) {
260 Document doc = searcher.doc(result[i].doc);
261 String[] v = doc.getValues("tracer");
262 for (int j = 0 ; j < v.length ; ++j) {
266 assertEquals(expectedResult, buff.toString());
269 public void assertThreadSafe(final Analyzer analyzer) throws Exception {
270 int numTestPoints = 100;
271 int numThreads = _TestUtil.nextInt(random, 3, 5);
272 final HashMap<String,String> map = new HashMap<String,String>();
274 // create a map<String,SortKey> up front.
275 // then with multiple threads, generate sort keys for all the keys in the map
276 // and ensure they are the same as the ones we produced in serial fashion.
278 for (int i = 0; i < numTestPoints; i++) {
279 String term = _TestUtil.randomSimpleString(random);
280 TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
281 CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
283 assertTrue(ts.incrementToken());
284 // ensure we make a copy of the actual bytes too
285 map.put(term, encodedBytes.toString());
288 Thread threads[] = new Thread[numThreads];
289 for (int i = 0; i < numThreads; i++) {
290 threads[i] = new Thread() {
294 for (Map.Entry<String,String> mapping : map.entrySet()) {
295 String term = mapping.getKey();
296 String expected = mapping.getValue();
297 TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
298 CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
300 assertTrue(ts.incrementToken());
301 assertEquals(expected, encodedBytes.toString());
303 } catch (IOException e) {
304 throw new RuntimeException(e);
309 for (int i = 0; i < numThreads; i++) {
312 for (int i = 0; i < numThreads; i++) {