lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/index/TestIndexWriterUnicode.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.HashSet;
  22 import java.util.Iterator;
  23 import java.util.Random;
  24 import java.util.Set;
  25
  26 import org.apache.lucene.analysis.MockAnalyzer;
  27 import org.apache.lucene.document.Document;
  28 import org.apache.lucene.document.Field;
  29 import org.apache.lucene.store.Directory;
  30 import org.apache.lucene.util.BytesRef;
  31 import org.apache.lucene.util.LuceneTestCase;
  32 import org.apache.lucene.util.UnicodeUtil;
  33
  34 public class TestIndexWriterUnicode extends LuceneTestCase {
  35
  36   final String[] utf8Data = new String[] {
  37     // unpaired low surrogate
  38     "ab\udc17cd", "ab\ufffdcd",
  39     "\udc17abcd", "\ufffdabcd",
  40     "\udc17", "\ufffd",
  41     "ab\udc17\udc17cd", "ab\ufffd\ufffdcd",
  42     "\udc17\udc17abcd", "\ufffd\ufffdabcd",
  43     "\udc17\udc17", "\ufffd\ufffd",
  44
  45     // unpaired high surrogate
  46     "ab\ud917cd", "ab\ufffdcd",
  47     "\ud917abcd", "\ufffdabcd",
  48     "\ud917", "\ufffd",
  49     "ab\ud917\ud917cd", "ab\ufffd\ufffdcd",
  50     "\ud917\ud917abcd", "\ufffd\ufffdabcd",
  51     "\ud917\ud917", "\ufffd\ufffd",
  52
  53     // backwards surrogates
  54     "ab\udc17\ud917cd", "ab\ufffd\ufffdcd",
  55     "\udc17\ud917abcd", "\ufffd\ufffdabcd",
  56     "\udc17\ud917", "\ufffd\ufffd",
  57     "ab\udc17\ud917\udc17\ud917cd", "ab\ufffd\ud917\udc17\ufffdcd",
  58     "\udc17\ud917\udc17\ud917abcd", "\ufffd\ud917\udc17\ufffdabcd",
  59     "\udc17\ud917\udc17\ud917", "\ufffd\ud917\udc17\ufffd"
  60   };
  61
  62   private int nextInt(int lim) {
  63     return random.nextInt(lim);
  64   }
  65
  66   private int nextInt(int start, int end) {
  67     return start + nextInt(end-start);
  68   }
  69
  70   private boolean fillUnicode(char[] buffer, char[] expected, int offset, int count) {
  71     final int len = offset + count;
  72     boolean hasIllegal = false;
  73
  74     if (offset > 0 && buffer[offset] >= 0xdc00 && buffer[offset] < 0xe000)
  75       // Don't start in the middle of a valid surrogate pair
  76       offset--;
  77
  78     for(int i=offset;i<len;i++) {
  79       int t = nextInt(6);
  80       if (0 == t && i < len-1) {
  81         // Make a surrogate pair
  82         // High surrogate
  83         expected[i] = buffer[i++] = (char) nextInt(0xd800, 0xdc00);
  84         // Low surrogate
  85         expected[i] = buffer[i] = (char) nextInt(0xdc00, 0xe000);
  86       } else if (t <= 1)
  87         expected[i] = buffer[i] = (char) nextInt(0x80);
  88       else if (2 == t)
  89         expected[i] = buffer[i] = (char) nextInt(0x80, 0x800);
  90       else if (3 == t)
  91         expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
  92       else if (4 == t)
  93         expected[i] = buffer[i] = (char) nextInt(0xe000, 0xffff);
  94       else if (5 == t && i < len-1) {
  95         // Illegal unpaired surrogate
  96         if (nextInt(10) == 7) {
  97           if (random.nextBoolean())
  98             buffer[i] = (char) nextInt(0xd800, 0xdc00);
  99           else
 100             buffer[i] = (char) nextInt(0xdc00, 0xe000);
 101           expected[i++] = 0xfffd;
 102           expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
 103           hasIllegal = true;
 104         } else
 105           expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
 106       } else {
 107         expected[i] = buffer[i] = ' ';
 108       }
 109     }
 110
 111     return hasIllegal;
 112   }
 113
 114   // both start & end are inclusive
 115   private final int getInt(Random r, int start, int end) {
 116     return start + r.nextInt(1+end-start);
 117   }
 118
 119   private final String asUnicodeChar(char c) {
 120     return "U+" + Integer.toHexString(c);
 121   }
 122
 123   private final String termDesc(String s) {
 124     final String s0;
 125     assertTrue(s.length() <= 2);
 126     if (s.length() == 1) {
 127       s0 = asUnicodeChar(s.charAt(0));
 128     } else {
 129       s0 = asUnicodeChar(s.charAt(0)) + "," + asUnicodeChar(s.charAt(1));
 130     }
 131     return s0;
 132   }
 133
 134   // LUCENE-510
 135   public void testRandomUnicodeStrings() throws Throwable {
 136     char[] buffer = new char[20];
 137     char[] expected = new char[20];
 138
 139     UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
 140     UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
 141
 142     int num = atLeast(100000);
 143     for (int iter = 0; iter < num; iter++) {
 144       boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);
 145
 146       UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
 147       if (!hasIllegal) {
 148         byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
 149         assertEquals(b.length, utf8.length);
 150         for(int i=0;i<b.length;i++)
 151           assertEquals(b[i], utf8.result[i]);
 152       }
 153
 154       UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
 155       assertEquals(utf16.length, 20);
 156       for(int i=0;i<20;i++)
 157         assertEquals(expected[i], utf16.result[i]);
 158     }
 159   }
 160
 161   // LUCENE-510
 162   public void testAllUnicodeChars() throws Throwable {
 163
 164     UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
 165     UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
 166     char[] chars = new char[2];
 167     for(int ch=0;ch<0x0010FFFF;ch++) {
 168
 169       if (ch == 0xd800)
 170         // Skip invalid code points
 171         ch = 0xe000;
 172
 173       int len = 0;
 174       if (ch <= 0xffff) {
 175         chars[len++] = (char) ch;
 176       } else {
 177         chars[len++] = (char) (((ch-0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
 178         chars[len++] = (char) (((ch-0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
 179       }
 180
 181       UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);
 182
 183       String s1 = new String(chars, 0, len);
 184       String s2 = new String(utf8.result, 0, utf8.length, "UTF-8");
 185       assertEquals("codepoint " + ch, s1, s2);
 186
 187       UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
 188       assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length));
 189
 190       byte[] b = s1.getBytes("UTF-8");
 191       assertEquals(utf8.length, b.length);
 192       for(int j=0;j<utf8.length;j++)
 193         assertEquals(utf8.result[j], b[j]);
 194     }
 195   }
 196
 197   public void testEmbeddedFFFF() throws Throwable {
 198
 199     Directory d = newDirectory();
 200     IndexWriter w = new IndexWriter(d, newIndexWriterConfig( TEST_VERSION_CURRENT, new TestIndexWriter.StringSplitAnalyzer()));
 201     Document doc = new Document();
 202     doc.add(newField("field", "a a\uffffb", Field.Store.NO, Field.Index.ANALYZED));
 203     w.addDocument(doc);
 204     doc = new Document();
 205     doc.add(newField("field", "a", Field.Store.NO, Field.Index.ANALYZED));
 206     w.addDocument(doc);
 207     w.close();
 208
 209     d.close();
 210   }
 211
 212   // LUCENE-510
 213   public void testInvalidUTF16() throws Throwable {
 214     Directory dir = newDirectory();
 215     IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new TestIndexWriter.StringSplitAnalyzer()));
 216     Document doc = new Document();
 217
 218     final int count = utf8Data.length/2;
 219     for(int i=0;i<count;i++)
 220       doc.add(newField("f" + i, utf8Data[2*i], Field.Store.YES, Field.Index.ANALYZED));
 221     w.addDocument(doc);
 222     w.close();
 223
 224     IndexReader ir = IndexReader.open(dir, true);
 225     Document doc2 = ir.document(0);
 226     for(int i=0;i<count;i++) {
 227       assertEquals("field " + i + " was not indexed correctly", 1, ir.docFreq(new Term("f"+i, utf8Data[2*i+1])));
 228       assertEquals("field " + i + " is incorrect", utf8Data[2*i+1], doc2.getField("f"+i).stringValue());
 229     }
 230     ir.close();
 231     dir.close();
 232   }
 233
 234   // LUCENE-510
 235   public void testIncrementalUnicodeStrings() throws Throwable {
 236     char[] buffer = new char[20];
 237     char[] expected = new char[20];
 238
 239     UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
 240     UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
 241     UnicodeUtil.UTF16Result utf16a = new UnicodeUtil.UTF16Result();
 242
 243     boolean hasIllegal = false;
 244     byte[] last = new byte[60];
 245
 246     int num = atLeast(100000);
 247     for (int iter = 0; iter < num; iter++) {
 248
 249       final int prefix;
 250
 251       if (iter == 0 || hasIllegal)
 252         prefix = 0;
 253       else
 254         prefix = nextInt(20);
 255
 256       hasIllegal = fillUnicode(buffer, expected, prefix, 20-prefix);
 257
 258       UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
 259       if (!hasIllegal) {
 260         byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
 261         assertEquals(b.length, utf8.length);
 262         for(int i=0;i<b.length;i++)
 263           assertEquals(b[i], utf8.result[i]);
 264       }
 265
 266       int bytePrefix = 20;
 267       if (iter == 0 || hasIllegal)
 268         bytePrefix = 0;
 269       else
 270         for(int i=0;i<20;i++)
 271           if (last[i] != utf8.result[i]) {
 272             bytePrefix = i;
 273             break;
 274           }
 275       System.arraycopy(utf8.result, 0, last, 0, utf8.length);
 276
 277       UnicodeUtil.UTF8toUTF16(utf8.result, bytePrefix, utf8.length-bytePrefix, utf16);
 278       assertEquals(20, utf16.length);
 279       for(int i=0;i<20;i++)
 280         assertEquals(expected[i], utf16.result[i]);
 281
 282       UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16a);
 283       assertEquals(20, utf16a.length);
 284       for(int i=0;i<20;i++)
 285         assertEquals(expected[i], utf16a.result[i]);
 286     }
 287   }
 288 }