pylucene 3.5.0-3
diff --git a/lucene-java-3.5.0/lucene/src/test-framework/java/org/apache/lucene/analysis/CollationTestBase.java b/lucene-java-3.5.0/lucene/src/test-framework/java/org/apache/lucene/analysis/CollationTestBase.java
new file mode 100644
index 0000000..97ec33b
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/test-framework/java/org/apache/lucene/analysis/CollationTestBase.java
@@ -0,0 +1,319 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermRangeFilter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.util.IndexableBinaryStringTools;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+public abstract class CollationTestBase extends LuceneTestCase {
+
+  protected String firstRangeBeginningOriginal = "\u062F";
+  protected String firstRangeEndOriginal = "\u0698";
+  
+  protected String secondRangeBeginningOriginal = "\u0633";
+  protected String secondRangeEndOriginal = "\u0638";
+  
+  /**
+   * Convenience method that performs the same function as CollationKeyFilter:
+   * it encodes raw collation key bytes into a form that can be indexed as a
+   * String.
+   *
+   * @param keyBits the result of
+   *  collator.getCollationKey(original).toByteArray()
+   * @return the encoded collation key for the original String
+   */
+  protected String encodeCollationKey(byte[] keyBits) {
+    // Compute the exact encoded length and size the backing char[] array to
+    // hold the encoded binary string
+    int encodedLength = IndexableBinaryStringTools.getEncodedLength(keyBits, 0, keyBits.length);
+    char[] encodedBegArray = new char[encodedLength];
+    IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
+    return new String(encodedBegArray);
+  }
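+
+  // A minimal usage sketch of encodeCollationKey (illustrative only; the
+  // java.text.Collator and the Arabic locale are assumptions of the sketch,
+  // not something this base class mandates):
+  //
+  //   Collator collator = Collator.getInstance(new Locale("ar"));
+  //   String encoded = encodeCollationKey(
+  //       collator.getCollationKey("\u0633\u0627\u0628").toByteArray());
+  //   // 'encoded' can now be indexed and range-searched like any String.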
+    
+  public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg, 
+                                            String firstEnd, String secondBeg,
+                                            String secondEnd) throws Exception {
+    RAMDirectory ramDir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzer));
+    Document doc = new Document();
+    doc.add(new Field("content", "\u0633\u0627\u0628", 
+                      Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("body", "body",
+                      Field.Store.YES, Field.Index.NOT_ANALYZED));
+    writer.addDocument(doc);
+    writer.close();
+    IndexReader reader = IndexReader.open(ramDir);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    Query query = new TermQuery(new Term("body","body"));
+
+    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
+    // orders the U+0698 character before the U+0633 character, so the single
+    // index Term below should NOT be returned by a TermRangeFilter with a Farsi
+    // Collator (or an Arabic one for the case when Farsi is not supported).
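+    //
+    // A sketch of the collator ordering this relies on (assuming the JDK has
+    // Farsi rules; otherwise new Locale("ar") is the stand-in, as above):
+    //
+    //   Collator farsi = Collator.getInstance(new Locale("fa"));
+    //   assert farsi.compare("\u0698", "\u0633") < 0; // zhe before sin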
+    ScoreDoc[] result = searcher.search(query,
+        new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).scoreDocs;
+    assertEquals("The index Term should not be included.", 0, result.length);
+
+    result = searcher.search(query,
+        new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).scoreDocs;
+    assertEquals("The index Term should be included.", 1, result.length);
+
+    searcher.close();
+    reader.close();
+  }
+
+  public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
+                                            String firstEnd, String secondBeg,
+                                            String secondEnd) throws Exception {
+    RAMDirectory ramDir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzer));
+    Document doc = new Document();
+
+    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
+    // orders the U+0698 character before the U+0633 character, so the single
+    // index Term below should NOT be returned by a TermRangeQuery with a Farsi
+    // Collator (or an Arabic one for the case when Farsi is not supported).
+    doc.add(new Field("content", "\u0633\u0627\u0628", 
+                      Field.Store.YES, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+    writer.close();
+    IndexReader reader = IndexReader.open(ramDir);
+    IndexSearcher searcher = new IndexSearcher(reader);
+
+    Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
+    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+    assertEquals("The index Term should not be included.", 0, hits.length);
+
+    query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
+    hits = searcher.search(query, null, 1000).scoreDocs;
+    assertEquals("The index Term should be included.", 1, hits.length);
+    searcher.close();
+    reader.close();
+  }
+
+  public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
+      String firstEnd, String secondBeg, String secondEnd) throws Exception {
+
+    RAMDirectory farsiIndex = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzer));
+    Document doc = new Document();
+    doc.add(new Field("content", "\u0633\u0627\u0628", 
+                      Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("body", "body",
+                      Field.Store.YES, Field.Index.NOT_ANALYZED));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexReader reader = IndexReader.open(farsiIndex, true);
+    IndexSearcher search = newSearcher(reader);
+        
+    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
+    // orders the U+0698 character before the U+0633 character, so the single
+    // index Term below should NOT be returned by a TermRangeQuery
+    // with a Farsi Collator (or an Arabic one for the case when Farsi is 
+    // not supported).
+    Query csrq = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
+    ScoreDoc[] result = search.search(csrq, null, 1000).scoreDocs;
+    assertEquals("The index Term should not be included.", 0, result.length);
+
+    csrq = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
+    result = search.search(csrq, null, 1000).scoreDocs;
+    assertEquals("The index Term should be included.", 1, result.length);
+    search.close();
+    reader.close();
+  }
+  
+  // Test using various international locales with accented characters (which
+  // sort differently depending on locale); see the usage sketch after this
+  // method.
+  //
+  // Copied (and slightly modified) from
+  // org.apache.lucene.search.TestSort.testInternationalSort()
+  //
+  // TODO: this test is really fragile: there are already three different
+  // expected orderings, depending on the Unicode version in use.
+  public void testCollationKeySort(Analyzer usAnalyzer,
+                                   Analyzer franceAnalyzer,
+                                   Analyzer swedenAnalyzer,
+                                   Analyzer denmarkAnalyzer,
+                                   String usResult,
+                                   String frResult,
+                                   String svResult,
+                                   String dkResult) throws Exception {
+    RAMDirectory indexStore = new RAMDirectory();
+    PerFieldAnalyzerWrapper analyzer
+      = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
+    analyzer.addAnalyzer("US", usAnalyzer);
+    analyzer.addAnalyzer("France", franceAnalyzer);
+    analyzer.addAnalyzer("Sweden", swedenAnalyzer);
+    analyzer.addAnalyzer("Denmark", denmarkAnalyzer);
+    IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, analyzer));
+
+    // document data:
+    // the tracer field is used to determine which document was hit
+    String[][] sortData = new String[][] {
+      // tracer contents US                 France             Sweden (sv_SE)     Denmark (da_DK)
+      {  "A",   "x",     "p\u00EAche",      "p\u00EAche",      "p\u00EAche",      "p\u00EAche"      },
+      {  "B",   "y",     "HAT",             "HAT",             "HAT",             "HAT"             },
+      {  "C",   "x",     "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" },
+      {  "D",   "y",     "HUT",             "HUT",             "HUT",             "HUT"             },
+      {  "E",   "x",     "peach",           "peach",           "peach",           "peach"           },
+      {  "F",   "y",     "H\u00C5T",        "H\u00C5T",        "H\u00C5T",        "H\u00C5T"        },
+      {  "G",   "x",     "sin",             "sin",             "sin",             "sin"             },
+      {  "H",   "y",     "H\u00D8T",        "H\u00D8T",        "H\u00D8T",        "H\u00D8T"        },
+      {  "I",   "x",     "s\u00EDn",        "s\u00EDn",        "s\u00EDn",        "s\u00EDn"        },
+      {  "J",   "y",     "HOT",             "HOT",             "HOT",             "HOT"             },
+    };
+
+    for (int i = 0 ; i < sortData.length ; ++i) {
+      Document doc = new Document();
+      doc.add(new Field("tracer", sortData[i][0], 
+                        Field.Store.YES, Field.Index.NO));
+      doc.add(new Field("contents", sortData[i][1], 
+                        Field.Store.NO, Field.Index.ANALYZED));
+      if (sortData[i][2] != null) 
+        doc.add(new Field("US", sortData[i][2], 
+                          Field.Store.NO, Field.Index.ANALYZED));
+      if (sortData[i][3] != null) 
+        doc.add(new Field("France", sortData[i][3], 
+                          Field.Store.NO, Field.Index.ANALYZED));
+      if (sortData[i][4] != null)
+        doc.add(new Field("Sweden", sortData[i][4], 
+                          Field.Store.NO, Field.Index.ANALYZED));
+      if (sortData[i][5] != null) 
+        doc.add(new Field("Denmark", sortData[i][5], 
+                          Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+    writer.forceMerge(1);
+    writer.close();
+    IndexReader reader = IndexReader.open(indexStore);
+    IndexSearcher searcher = new IndexSearcher(reader);
+
+    Sort sort = new Sort();
+    Query queryX = new TermQuery(new Term("contents", "x"));
+    Query queryY = new TermQuery(new Term("contents", "y"));
+    
+    sort.setSort(new SortField("US", SortField.STRING));
+    assertMatches(searcher, queryY, sort, usResult);
+
+    sort.setSort(new SortField("France", SortField.STRING));
+    assertMatches(searcher, queryX, sort, frResult);
+
+    sort.setSort(new SortField("Sweden", SortField.STRING));
+    assertMatches(searcher, queryY, sort, svResult);
+
+    sort.setSort(new SortField("Denmark", SortField.STRING));
+    assertMatches(searcher, queryY, sort, dkResult);
+    searcher.close();
+    reader.close();
+  }
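+
+  // Typical invocation (a sketch; CollationKeyAnalyzer is from Lucene's
+  // contrib collation module, and the expected tracer orderings below are
+  // placeholders -- the real values depend on the collation tables in use):
+  //
+  //   testCollationKeySort(
+  //       new CollationKeyAnalyzer(Collator.getInstance(Locale.US)),
+  //       new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE)),
+  //       new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "SE"))),
+  //       new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "DK"))),
+  //       "<usOrder>", "<frOrder>", "<svOrder>", "<dkOrder>");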
+    
+  // Make sure the documents returned by the search match the expected list
+  // Copied from TestSort.java
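+  // For example, if the query hits the docs whose tracers sort as A, C, E,
+  // the expected result string is "ACE".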
+  private void assertMatches(Searcher searcher, Query query, Sort sort, 
+                             String expectedResult) throws IOException {
+    ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
+    StringBuilder buff = new StringBuilder(10);
+    int n = result.length;
+    for (int i = 0 ; i < n ; ++i) {
+      Document doc = searcher.doc(result[i].doc);
+      String[] v = doc.getValues("tracer");
+      for (int j = 0 ; j < v.length ; ++j) {
+        buff.append(v[j]);
+      }
+    }
+    assertEquals(expectedResult, buff.toString());
+  }
+
+  public void assertThreadSafe(final Analyzer analyzer) throws Exception {
+    int numTestPoints = 100;
+    int numThreads = _TestUtil.nextInt(random, 3, 5);
+    final HashMap<String,String> map = new HashMap<String,String>();
+    
+    // Create a Map<String,String> of term -> encoded sort key up front on a
+    // single thread. Then, from multiple threads, regenerate the sort keys
+    // for every term in the map and ensure they match the serially produced
+    // values.
+
+    for (int i = 0; i < numTestPoints; i++) {
+      String term = _TestUtil.randomSimpleString(random);
+      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+      CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
+      ts.reset();
+      assertTrue(ts.incrementToken());
+      // ensure we make a copy of the actual bytes too
+      map.put(term, encodedBytes.toString());
+    }
+    
+    Thread[] threads = new Thread[numThreads];
+    for (int i = 0; i < numThreads; i++) {
+      threads[i] = new Thread() {
+        @Override
+        public void run() {
+          try {
+            for (Map.Entry<String,String> mapping : map.entrySet()) {
+              String term = mapping.getKey();
+              String expected = mapping.getValue();
+              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+              CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
+              ts.reset();
+              assertTrue(ts.incrementToken());
+              assertEquals(expected, encodedBytes.toString());
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      };
+    }
+    for (int i = 0; i < numThreads; i++) {
+      threads[i].start();
+    }
+    for (int i = 0; i < numThreads; i++) {
+      threads[i].join();
+    }
+  }
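+
+  // Example call (a sketch; any Analyzer that emits one deterministic token
+  // per input works, e.g. contrib's CollationKeyAnalyzer):
+  //
+  //   assertThreadSafe(new CollationKeyAnalyzer(Collator.getInstance()));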
+}