X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/collation/package.html diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/collation/package.html b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/collation/package.html new file mode 100644 index 0000000..7792b5e --- /dev/null +++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/collation/package.html @@ -0,0 +1,176 @@ + + + +
+
+ CollationKeyFilter
+ converts each token into its binary CollationKey
using the
+ provided Collator
, and then encodes the CollationKey
+ as a String using
+ {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to be
+ stored as an index term.
+
+ // "fa" Locale is not supported by Sun JDK 1.4 or 1.5 + Collator collator = Collator.getInstance(new Locale("ar")); + CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator); + RAMDirectory ramDir = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("content", "\u0633\u0627\u0628", + Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.close(); + IndexSearcher is = new IndexSearcher(ramDir, true); + + // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries + // to be passed through an analyzer - Lucene's standard QueryParser does not + // allow this. + AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer); + aqp.setLowercaseExpandedTerms(false); + + // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi + // orders the U+0698 character before the U+0633 character, so the single + // indexed Term above should NOT be returned by a ConstantScoreRangeQuery + // with a Farsi Collator (or an Arabic one for the case when Farsi is not + // supported). + ScoreDoc[] result + = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs; + assertEquals("The index Term should not be included.", 0, result.length); ++ +
+ Analyzer analyzer + = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk"))); + RAMDirectory indexStore = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + String[] tracer = new String[] { "A", "B", "C", "D", "E" }; + String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" }; + String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" }; + for (int i = 0 ; i < data.length ; ++i) { + Document doc = new Document(); + doc.add(new Field("tracer", tracer[i], Field.Store.YES, Field.Index.NO)); + doc.add(new Field("contents", data[i], Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + writer.close(); + Searcher searcher = new IndexSearcher(indexStore, true); + Sort sort = new Sort(); + sort.setSort(new SortField("contents", SortField.STRING)); + Query query = new MatchAllDocsQuery(); + ScoreDoc[] result = searcher.search(query, null, 1000, sort).scoreDocs; + for (int i = 0 ; i < result.length ; ++i) { + Document doc = searcher.doc(result[i].doc); + assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]); + } ++ +
+ Collator collator = Collator.getInstance(new Locale("tr", "TR")); + collator.setStrength(Collator.PRIMARY); + Analyzer analyzer = new CollationKeyAnalyzer(collator); + RAMDirectory ramDir = new RAMDirectory(); + IndexWriter writer = new IndexWriter + (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + Document doc = new Document(); + doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED)); + writer.addDocument(doc); + writer.close(); + IndexSearcher is = new IndexSearcher(ramDir, true); + QueryParser parser = new QueryParser("contents", analyzer); + Query query = parser.parse("d\u0131gy"); // U+0131: dotless i + ScoreDoc[] result = is.search(query, null, 1000).scoreDocs; + assertEquals("The index Term should be included.", 1, result.length); ++ +
+ WARNING: Make sure you use exactly the same
+ Collator
at index and query time -- CollationKey
s
+ are only comparable when produced by
+ the same Collator
. Since {@link java.text.RuleBasedCollator}s
+ are not independently versioned, it is unsafe to search against stored
+ CollationKey
s unless the following are exactly the same (best
+ practice is to store this information with the index and check that they
+ remain the same at query time):
+
+ ICUCollationKeyFilter
, available in the icu package in Lucene's contrib area,
+ uses ICU4J's Collator
, which
+ makes its version available, thus allowing collation to be versioned
+ independently from the JVM. ICUCollationKeyFilter
is also
+ significantly faster and generates significantly shorter keys than
+ CollationKeyFilter
. See
+ http://site.icu-project.org/charts/collation-icu4j-sun for key
+ generation timing and key length comparisons between ICU4J and
+ java.text.Collator
over several languages.
+
+ CollationKey
s generated by java.text.Collator
s are
+ not compatible with those generated by ICU Collators. Specifically, if
+ you use CollationKeyFilter
to generate index terms, do not use
+ ICUCollationKeyFilter
on the query side, or vice versa.
+
++ +