2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.compound.hyphenation;
20 import java.util.Enumeration;
21 import java.util.Stack;
22 import java.io.Serializable;
25 * <h2>Ternary Search Tree.</h2>
28 * A ternary search tree is a hybrid between a binary tree and a digital search
29 * tree (trie). Keys are limited to strings. A data value of type char is stored
30 * in each leaf node. It can be used as an index (or pointer) to the data.
31 * Branches that only contain one key are compressed to one node by storing a
32 * pointer to the trailer substring of the key. This class is intended to serve
33 * as base class or helper class to implement Dictionary collections or the
34 * like. Ternary trees have some nice properties as the following: the tree can
35 * be traversed in sorted order, partial matches (wildcard) can be implemented,
36 * retrieval of all keys within a given distance from the target, etc. The
37 * storage requirements are higher than a binary tree but a lot less than a
38 * trie. Performance is comparable with a hash table, sometimes it outperforms a
39 * hash function (most of the time can determine a miss faster than a hash).
43 * The main purpose of this java port is to serve as a base for implementing
44 * TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
45 * requires from 5000 to 15000 hyphenation patterns which will be keys in this
46 * tree. The strings patterns are usually small (from 2 to 5 characters), but
47 * each char in the tree is stored in a node. Thus memory usage is the main
48 * concern. We will sacrifice 'elegance' to keep memory requirements to the
49 * minimum. Using java's char type as pointer (yes, I know pointer it is a
50 * forbidden word in java) we can keep the size of the node to be just 8 bytes
51 * (3 pointers and the data char). This gives room for about 65000 nodes. In my
52 * tests the english patterns took 7694 nodes and the german patterns 10055
53 * nodes, so I think we are safe.
57 * All said, this is a map with strings as keys and char as value. Pretty
58 * limited!. It can be extended to a general map by using the string
59 * representation of an object and using the char value as an index to an array
60 * that contains the object values.
63 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
66 public class TernaryTree implements Cloneable, Serializable {
69 * We use 4 arrays to represent a node. I guess I should have created a proper
70 * node class, but somehow Knuth's pascal code made me forget we now have a
71 * portable language with virtual memory management and automatic garbage
72 * collection! And now is kind of late, furthermore, if it ain't broken, don't
77 * Pointer to low branch and to rest of the key when it is stored directly in
78 * this node, we don't have unions in java!
83 * Pointer to high branch.
88 * Pointer to equal branch and to data when this node is a string terminator.
94 * The character stored in this node: splitchar. Two special values are
98 * <li>0x0000 as string terminator</li>
99 * <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
102 * This shouldn't be a problem if we give the usual semantics to strings since
103 * 0xFFFF is guaranteed not to be an Unicode character.
109 * This vector holds the trailing of the keys when the branch is compressed.
111 protected CharVector kv;
115 protected char freenode;
117 protected int length; // number of items in tree
119 protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
125 protected void init() {
129 lo = new char[BLOCK_SIZE];
130 hi = new char[BLOCK_SIZE];
131 eq = new char[BLOCK_SIZE];
132 sc = new char[BLOCK_SIZE];
133 kv = new CharVector();
137 * Branches are initially compressed, needing one node per key plus the size
138 * of the string key. They are decompressed as needed when another key with
139 * same prefix is inserted. This saves a lot of space, specially for long
142 public void insert(String key, char val) {
143 // make sure we have enough room in the arrays
144 int len = key.length() + 1; // maximum number of nodes that may be generated
145 if (freenode + len > eq.length) {
146 redimNodeArrays(eq.length + BLOCK_SIZE);
148 char strkey[] = new char[len--];
149 key.getChars(0, len, strkey, 0);
151 root = insert(root, strkey, 0, val);
154 public void insert(char[] key, int start, char val) {
155 int len = strlen(key) + 1;
156 if (freenode + len > eq.length) {
157 redimNodeArrays(eq.length + BLOCK_SIZE);
159 root = insert(root, key, start, val);
163 * The actual insertion function, recursive version.
165 private char insert(char p, char[] key, int start, char val) {
166 int len = strlen(key, start);
168 // this means there is no branch, this node will start a new branch.
169 // Instead of doing that, we store the key somewhere else and create
170 // only one node with a pointer to the key
172 eq[p] = val; // holds data
176 sc[p] = 0xFFFF; // indicates branch is compressed
177 lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
178 strcpy(kv.getArray(), lo[p], key, start);
186 if (sc[p] == 0xFFFF) {
187 // branch is compressed: need to decompress
188 // this will generate garbage in the external key array
189 // but we can do some garbage collection later
190 char pp = freenode++;
191 lo[pp] = lo[p]; // previous pointer to key
192 eq[pp] = eq[p]; // previous pointer to data
195 sc[p] = kv.get(lo[pp]);
198 if (kv.get(lo[pp]) == 0) {
199 // key completly decompressed leaving garbage in key array
204 // we only got first char of key, rest is still there
208 // In this case we can save a node by swapping the new node
209 // with the compressed node
220 lo[p] = insert(lo[p], key, start, val);
221 } else if (s == sc[p]) {
223 eq[p] = insert(eq[p], key, start + 1, val);
225 // key already in tree, overwrite data
229 hi[p] = insert(hi[p], key, start, val);
235 * Compares 2 null terminated char arrays
237 public static int strcmp(char[] a, int startA, char[] b, int startB) {
238 for (; a[startA] == b[startB]; startA++, startB++) {
239 if (a[startA] == 0) {
243 return a[startA] - b[startB];
247 * Compares a string with null terminated char array
249 public static int strcmp(String str, char[] a, int start) {
250 int i, d, len = str.length();
251 for (i = 0; i < len; i++) {
252 d = (int) str.charAt(i) - a[start + i];
256 if (a[start + i] == 0) {
260 if (a[start + i] != 0) {
261 return -a[start + i];
267 public static void strcpy(char[] dst, int di, char[] src, int si) {
268 while (src[si] != 0) {
269 dst[di++] = src[si++];
274 public static int strlen(char[] a, int start) {
276 for (int i = start; i < a.length && a[i] != 0; i++) {
282 public static int strlen(char[] a) {
286 public int find(String key) {
287 int len = key.length();
288 char strkey[] = new char[len + 1];
289 key.getChars(0, len, strkey, 0);
292 return find(strkey, 0);
295 public int find(char[] key, int start) {
302 if (sc[p] == 0xFFFF) {
303 if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
326 public boolean knows(String key) {
327 return (find(key) >= 0);
330 // redimension the arrays
331 private void redimNodeArrays(int newsize) {
332 int len = newsize < lo.length ? newsize : lo.length;
333 char[] na = new char[newsize];
334 System.arraycopy(lo, 0, na, 0, len);
336 na = new char[newsize];
337 System.arraycopy(hi, 0, na, 0, len);
339 na = new char[newsize];
340 System.arraycopy(eq, 0, na, 0, len);
342 na = new char[newsize];
343 System.arraycopy(sc, 0, na, 0, len);
352 public Object clone() {
353 TernaryTree t = new TernaryTree();
354 t.lo = this.lo.clone();
355 t.hi = this.hi.clone();
356 t.eq = this.eq.clone();
357 t.sc = this.sc.clone();
358 t.kv = (CharVector) this.kv.clone();
360 t.freenode = this.freenode;
361 t.length = this.length;
367 * Recursively insert the median first and then the median of the lower and
368 * upper halves, and so on in order to get a balanced tree. The array of keys
369 * is assumed to be sorted in ascending order.
371 protected void insertBalanced(String[] k, char[] v, int offset, int n) {
378 insert(k[m + offset], v[m + offset]);
379 insertBalanced(k, v, offset, m);
381 insertBalanced(k, v, offset + m + 1, n - m - 1);
385 * Balance the tree for best search performance
387 public void balance() {
388 // System.out.print("Before root splitchar = ");
389 // System.out.println(sc[root]);
391 int i = 0, n = length;
392 String[] k = new String[n];
393 char[] v = new char[n];
394 Iterator iter = new Iterator();
395 while (iter.hasMoreElements()) {
396 v[i] = iter.getValue();
397 k[i++] = iter.nextElement();
400 insertBalanced(k, v, 0, n);
402 // With uniform letter distribution sc[root] should be around 'm'
403 // System.out.print("After root splitchar = ");
404 // System.out.println(sc[root]);
408 * Each node stores a character (splitchar) which is part of some key(s). In a
409 * compressed branch (one that only contain a single string key) the trailer
410 * of the key which is not already in nodes is stored externally in the kv
411 * array. As items are inserted, key substrings decrease. Some substrings may
412 * completely disappear when the whole branch is totally decompressed. The
413 * tree is traversed to find the key substrings actually used. In addition,
414 * duplicate substrings are removed using a map (implemented with a
418 public void trimToSize() {
419 // first balance the tree for best performance
422 // redimension the node arrays
423 redimNodeArrays(freenode);
425 // ok, compact kv array
426 CharVector kx = new CharVector();
428 TernaryTree map = new TernaryTree();
429 compact(kx, map, root);
434 private void compact(CharVector kx, TernaryTree map, char p) {
439 if (sc[p] == 0xFFFF) {
440 k = map.find(kv.getArray(), lo[p]);
442 k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
443 strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
444 map.insert(kx.getArray(), k, (char) k);
448 compact(kx, map, lo[p]);
450 compact(kx, map, eq[p]);
452 compact(kx, map, hi[p]);
456 public Enumeration<String> keys() {
457 return new Iterator();
460 public class Iterator implements Enumeration<String> {
472 private class Item implements Cloneable {
482 public Item(char p, char c) {
488 public Object clone() {
489 return new Item(parent, child);
500 * key stack implemented with a StringBuilder
506 ns = new Stack<Item>();
507 ks = new StringBuilder();
511 public void rewind() {
512 ns.removeAllElements();
518 public String nextElement() {
519 String res = new String(curkey);
525 public char getValue() {
532 public boolean hasMoreElements() {
547 if (cur != 0 && sc[cur] == 0) {
551 boolean climb = true;
558 if (sc[i.parent] != 0) {
560 ns.push((Item) i.clone());
561 ks.append(sc[i.parent]);
564 ns.push((Item) i.clone());
572 ns.push((Item) i.clone());
573 if (ks.length() > 0) {
574 ks.setLength(ks.length() - 1); // pop
591 * traverse the tree to find next key
598 boolean leaf = false;
600 // first go down on low branch until leaf or compressed branch
602 if (sc[cur] == 0xFFFF) {
606 ns.push(new Item((char) cur, '\u0000'));
616 // nothing found, go up one node and try again
622 // The current node should be a data node and
623 // the key should be in the key stack (at least partially)
624 StringBuilder buf = new StringBuilder(ks.toString());
625 if (sc[cur] == 0xFFFF) {
627 while (kv.get(p) != 0) {
628 buf.append(kv.get(p++));
631 curkey = buf.toString();
637 public void printStats() {
638 System.out.println("Number of keys = " + Integer.toString(length));
639 System.out.println("Node count = " + Integer.toString(freenode));
640 // System.out.println("Array length = " + Integer.toString(eq.length));
641 System.out.println("Key Array length = " + Integer.toString(kv.length()));
644 * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
645 * System.out.print(kv.get(i)); else System.out.println("");
646 * System.out.println("Keys:"); for(Enumeration enum = keys();
647 * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
652 public static void main(String[] args) throws Exception {
653 TernaryTree tt = new TernaryTree();
654 tt.insert("Carlos", 'C');
655 tt.insert("Car", 'r');
656 tt.insert("palos", 'l');
657 tt.insert("pa", 'p');
659 System.out.println((char) tt.find("Car"));
660 System.out.println((char) tt.find("Carlos"));
661 System.out.println((char) tt.find("alto"));