X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/icu/src/java/overview.html

diff --git a/lucene-java-3.4.0/lucene/contrib/icu/src/java/overview.html b/lucene-java-3.4.0/lucene/contrib/icu/src/java/overview.html
deleted file mode 100644
index 0e55ea7..0000000
--- a/lucene-java-3.4.0/lucene/contrib/icu/src/java/overview.html
+++ /dev/null
@@ -1,382 +0,0 @@
This module exposes functionality from ICU to Apache Lucene. ICU4J is a Java library that enhances Java's internationalization support by improving performance, keeping current with the Unicode Standard, and providing richer APIs. This module exposes the following functionality: text segmentation (tokenization), collation, normalization, case folding, search term folding, and text transformation.
Text Segmentation (Tokenization) divides document and query text into index terms (typically words). Unicode provides special properties and rules so that this can be done in a manner that works well with most languages.
Text Segmentation implements the word segmentation specified in Unicode Text Segmentation (UAX #29). Additionally, the algorithm can be tailored based on writing system; for example, text in the Thai script is automatically delegated to a dictionary-based segmentation algorithm.
  /**
   * This tokenizer will work well in general for most languages.
   */
  Tokenizer tokenizer = new ICUTokenizer(reader);
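The tokenizer is consumed like any other Lucene TokenStream. The following is a minimal sketch, assuming the Lucene 3.x attribute-based TokenStream API (the input sentence is illustrative only), of iterating over the index terms it produces:

  // Assumes org.apache.lucene.analysis.icu.segmentation.ICUTokenizer and
  // org.apache.lucene.analysis.tokenattributes.CharTermAttribute are imported.
  Tokenizer tokenizer = new ICUTokenizer(new StringReader("ICU segments this text into words."));
  CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    System.out.println(term.toString()); // each index term, one per line
  }
  tokenizer.end();
  tokenizer.close();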
ICUCollationKeyFilter converts each token into its binary CollationKey using the provided Collator, and then encodes the CollationKey as a String using {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be stored as an index term.

ICUCollationKeyFilter depends on ICU4J 4.4 to produce the CollationKeys. icu4j-4.4.jar is included in Lucene's Subversion repository at contrib/icu/lib/.
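As a sketch of what the filter does when wired up by hand (assuming the Lucene 3.4 contrib constructors; ICUCollationKeyAnalyzer performs essentially this wrapping internally):

  // Note: this is ICU4J's com.ibm.icu.text.Collator, not java.text.Collator.
  Collator collator = Collator.getInstance(new Locale("da", "DK"));
  // Each token from the underlying tokenizer is replaced by its binary collation
  // key, encoded with IndexableBinaryStringTools so it can be stored as an index term.
  TokenStream tokenstream = new ICUCollationKeyFilter(new KeywordTokenizer(reader), collator);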
The following example demonstrates a collated range query for Farsi:

  Collator collator = Collator.getInstance(new Locale("ar"));
  ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
  RAMDirectory ramDir = new RAMDirectory();
  IndexWriter writer = new IndexWriter
    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
  Document doc = new Document();
  doc.add(new Field("content", "\u0633\u0627\u0628",
                    Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(doc);
  writer.close();
  IndexSearcher is = new IndexSearcher(ramDir, true);

  // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
  // to be passed through an analyzer - Lucene's standard QueryParser does not
  // allow this.
  AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
  aqp.setLowercaseExpandedTerms(false);

  // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
  // orders the U+0698 character before the U+0633 character, so the single
  // indexed Term above should NOT be returned by a ConstantScoreRangeQuery
  // with a Farsi Collator (or an Arabic one for the case when Farsi is not
  // supported).
  ScoreDoc[] result
    = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
  assertEquals("The index Term should not be included.", 0, result.length);
The following example demonstrates locale-sensitive sorting for Danish:

  Analyzer analyzer
    = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
  RAMDirectory indexStore = new RAMDirectory();
  IndexWriter writer = new IndexWriter
    (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
  String[] tracer = new String[] { "A", "B", "C", "D", "E" };
  String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
  String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
  for (int i = 0 ; i < data.length ; ++i) {
    Document doc = new Document();
    doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO));
    doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }
  writer.close();
  Searcher searcher = new IndexSearcher(indexStore, true);
  Sort sort = new Sort();
  sort.setSort(new SortField("contents", SortField.STRING));
  Query query = new MatchAllDocsQuery();
  ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs;
  for (int i = 0 ; i < result.length ; ++i) {
    Document doc = searcher.doc(result[i].doc);
    assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
  }
The following example demonstrates case-insensitive matching for Turkish:

  Collator collator = Collator.getInstance(new Locale("tr", "TR"));
  collator.setStrength(Collator.PRIMARY);
  Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
  RAMDirectory ramDir = new RAMDirectory();
  IndexWriter writer = new IndexWriter
    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
  Document doc = new Document();
  doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
  writer.addDocument(doc);
  writer.close();
  IndexSearcher is = new IndexSearcher(ramDir, true);
  QueryParser parser = new QueryParser("contents", analyzer);
  Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
  ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
  assertEquals("The index Term should be included.", 1, result.length);
WARNING: Make sure you use exactly the same Collator at index and query time -- CollationKeys are only comparable when produced by the same Collator. Since {@link java.text.RuleBasedCollator}s are not independently versioned, it is unsafe to search against stored CollationKeys unless the following are exactly the same (best practice is to store this information with the index and check that they remain the same at query time):

1. JVM vendor
2. JVM version, including patch version
3. The language (and country and variant, if specified) of the Locale used when constructing the collator via {@link java.text.Collator#getInstance(java.util.Locale)}
4. The collation strength used -- see {@link java.text.Collator#setStrength(int)}
ICUCollationKeyFilter uses ICU4J's Collator, which makes its version available, thus allowing collation to be versioned independently from the JVM. ICUCollationKeyFilter is also significantly faster and generates significantly shorter keys than CollationKeyFilter. See http://site.icu-project.org/charts/collation-icu4j-sun for key generation timing and key length comparisons between ICU4J and java.text.Collator over several languages.
CollationKeys generated by java.text.Collators are not compatible with those generated by ICU Collators. Specifically, if you use CollationKeyFilter to generate index terms, do not use ICUCollationKeyFilter on the query side, or vice versa.
ICUNormalizer2Filter normalizes term text to a Unicode Normalization Form, so that equivalent forms are standardized to a unique form.
  /**
   * Normalizer2 objects are unmodifiable and immutable.
   */
  Normalizer2 normalizer = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
  /**
   * This filter will normalize to NFC.
   */
  TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, normalizer);
Default caseless matching, or case-folding, is more than just conversion to lowercase. For example, it handles cases such as the Greek sigma, so that "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
Case-folding is still only an approximation of the language-specific rules governing case. If the specific language is known, consider using ICUCollationKeyFilter and indexing collation keys instead. This implementation performs the "full" case-folding specified in the Unicode standard, and this may change the length of the term. For example, the German ß is case-folded to the string 'ss'.
Case folding is related to normalization, and as such is coupled with it in this integration. To perform case-folding, you use normalization with the form "nfkc_cf" (which is the default).
  /**
   * This filter will case-fold and normalize to NFKC.
   */
  TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer);
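The same case folding can be observed outside a token stream by calling ICU4J's Normalizer2 directly. The sketch below uses the "nfkc_cf" instance described above together with the Greek example from this section:

  // NFKC_Casefold: compatibility normalization combined with Unicode case folding.
  Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  String folded1 = nfkcCf.normalize("Μάϊος");
  String folded2 = nfkcCf.normalize("ΜΆΪΟΣ");
  // Final and non-final sigma are unified by case folding, so both inputs
  // fold to the same string and the comparison prints true.
  System.out.println(folded1.equals(folded2));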
Search term folding removes distinctions (such as accent marks) between similar characters. It is useful for a fuzzy or loose search.
Search term folding implements many of the foldings specified in Character Foldings as a special normalization form. This folding applies NFKC, Case Folding, and many character foldings recursively.
  /**
   * This filter will case-fold, remove accents and other distinctions, and
   * normalize to NFKC.
   */
  TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
ICU provides text-transformation functionality via its Transliteration API. This allows you to transform text in a variety of ways, taking context into account.
For more information, see the User's Guide and Rule Tutorial.
  /**
   * This filter will map Traditional Chinese to Simplified Chinese
   */
  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
  /**
   * This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
   */
  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
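The same transform IDs can also be applied to plain strings through the Transliterator API itself; a small sketch using the Serbian transform from the example above (the input string is illustrative):

  // Obtain a transliterator for the Serbian Cyrillic to Latin (BGN) transform.
  Transliterator toLatin = Transliterator.getInstance("Serbian-Latin/BGN");
  // Convert a Cyrillic string to its Latin transliteration.
  String latin = toLatin.transliterate("Београд");
  System.out.println(latin);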
This module exists to provide up-to-date Unicode functionality that supports the most recent version of Unicode (currently 6.0). However, some users who wish for stronger backwards compatibility can restrict {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
  /**
   * This filter will do NFC normalization, but will ignore any characters that
   * did not exist as of Unicode 5.0. Because of the normalization stability policy
   * of Unicode, this is an easy way to force normalization to a specific version.
   */
  Normalizer2 normalizer = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
  UnicodeSet set = new UnicodeSet("[:age=5.0:]");
  // see FilteredNormalizer2 docs, the set should be frozen or performance will suffer
  set.freeze();
  FilteredNormalizer2 unicode50 = new FilteredNormalizer2(normalizer, set);
  TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, unicode50);