X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html
new file mode 100644
index 0000000..d19dbe5
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/package.html
@@ -0,0 +1,200 @@
+CompoundWordTokenFilter
+
+A filter that decomposes compound words found in many Germanic
+languages into their word parts. The following example shows what it does:
+
+ Input token stream:
+   Rindfleischüberwachungsgesetz Drahtschere abba
+
+ Output token stream:
+   (Rindfleischüberwachungsgesetz,0,29)
+   (Rind,0,4,posIncr=0)
+   (fleisch,4,11,posIncr=0)
+   (überwachung,11,22,posIncr=0)
+   (gesetz,23,29,posIncr=0)
+   (Drahtschere,30,41)
+   (Draht,30,35,posIncr=0)
+   (schere,35,41,posIncr=0)
+   (abba,42,46)
+
+The input token is always preserved and the filters do not alter the case of
+word parts. Two variants of the filter are available (a sketch of wiring one
+of them into an Analyzer follows the list):
+
+ - HyphenationCompoundWordTokenFilter: uses a hyphenation-grammar-based
+   approach to find potential word parts of a given word.
+ - DictionaryCompoundWordTokenFilter: uses a brute-force, dictionary-only
+   approach to find the word parts of a given word.
+
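+As a quick orientation, here is a minimal sketch of how one of the variants
+could be wired into a custom Analyzer so the decomposed parts are indexed
+together with the original tokens. The class name CompoundAwareAnalyzer is
+purely illustrative, and the sketch reuses the same simple constructors as
+the examples at the end of this page; a production analyzer would typically
+also override reusableTokenStream for per-thread reuse.
+
+  import java.io.Reader;
+
+  import org.apache.lucene.analysis.Analyzer;
+  import org.apache.lucene.analysis.TokenStream;
+  import org.apache.lucene.analysis.WhitespaceTokenizer;
+  import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
+
+  // Hypothetical analyzer that stacks compound parts on top of the original tokens.
+  public class CompoundAwareAnalyzer extends Analyzer {
+    private final String[] dict;
+
+    public CompoundAwareAnalyzer(String[] dict) {
+      this.dict = dict;
+    }
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      // Whitespace tokenization first, then decompose each token against the dictionary.
+      TokenStream ts = new WhitespaceTokenizer(reader);
+      return new DictionaryCompoundWordTokenFilter(ts, dict);
+    }
+  }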
+
+Compound word token filters
+HyphenationCompoundWordTokenFilter
+The {@link
+org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
+HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
+potential subwords that are worth checking against the dictionary. It can
+also be used without a dictionary, but it then produces a lot of "nonword"
+tokens. The quality of the output tokens is directly connected to the quality
+of the grammar file you use; for languages like German the available grammar
+files are quite good.
+Grammar file
+Unfortunately we cannot bundle the hyphenation grammar files with Lucene
+because they do not use an ASF-compatible license (they use the LaTeX
+Project Public License instead). You can find the XML-based grammar files
+at the Objects For Formatting Objects (OFFO) Sourceforge project (direct
+link to download the pattern files:
+http://downloads.sourceforge.net/offo/offo-hyphenation.zip). The files you
+need are in the subfolder offo-hyphenation/hyph/.
+
+Credits for the hyphenation code go to the Apache FOP project.
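+
+To verify that a downloaded grammar file is usable, the hyphenation tree can
+be loaded directly from the unpacked folder. The following is a minimal
+sketch: the relative path into offo-hyphenation/hyph/ and the file name
+de_DR.xml are taken from the examples below, and it relies on the same
+Reader-based getHyphenationTree call as those examples.
+
+  import java.io.FileReader;
+  import java.io.Reader;
+
+  import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+  import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+
+  public class LoadHyphenationGrammar {
+    public static void main(String[] args) throws Exception {
+      // Path into the unpacked offo-hyphenation.zip; adjust it to your download location.
+      Reader reader = new FileReader("offo-hyphenation/hyph/de_DR.xml");
+
+      // Parse the hyphenation patterns once; the resulting tree can be shared
+      // by all HyphenationCompoundWordTokenFilter instances.
+      HyphenationTree hyphenator =
+          HyphenationCompoundWordTokenFilter.getHyphenationTree(reader);
+
+      System.out.println("Hyphenation grammar loaded.");
+    }
+  }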
+
+DictionaryCompoundWordTokenFilter
+The {@link
+org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter
+DictionaryCompoundWordTokenFilter} uses a dictionary-only approach to
+find subwords in a compound word. It is much slower than the
+hyphenation-grammar-based filter, but because it is much simpler in design
+it is a good first step for checking whether your dictionary is adequate.
+
+Dictionary
+The output quality of both token filters is directly connected to the
+quality of the dictionary you use, and dictionaries are of course language
+dependent. You should always use a dictionary that fits the text you want
+to index: if you index medical text, for example, use a dictionary that
+contains medical terms. A good starting point for general text are the
+dictionaries found in the OpenOffice dictionaries Wiki (a sketch of loading
+such a word list follows below).
+
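+The dictionary is passed as a plain String[] in the examples at the end of
+this page. The following is a minimal sketch of turning a word list file
+(one entry per line; the file path and UTF-8 encoding are assumptions about
+your word list) into that form.
+
+  import java.io.BufferedReader;
+  import java.io.FileInputStream;
+  import java.io.InputStreamReader;
+  import java.util.ArrayList;
+  import java.util.List;
+
+  public class DictionaryLoader {
+    // Reads one dictionary entry per line and returns the String[] form
+    // accepted by the compound word token filter constructors.
+    public static String[] loadWords(String path) throws Exception {
+      List<String> words = new ArrayList<String>();
+      BufferedReader in = new BufferedReader(
+          new InputStreamReader(new FileInputStream(path), "UTF-8"));
+      try {
+        String line;
+        while ((line = in.readLine()) != null) {
+          line = line.trim();
+          if (line.length() > 0) {
+            words.add(line);
+          }
+        }
+      } finally {
+        in.close();
+      }
+      return words.toArray(new String[words.size()]);
+    }
+  }
+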
+Which variant should I use?
+This decision matrix should help you:
+
+
+ Token filter                        | Output quality                                          | Performance
+ ------------------------------------+----------------------------------------------------------+------------
+ HyphenationCompoundWordTokenFilter  | good if the grammar file is good, acceptable otherwise  | fast
+ DictionaryCompoundWordTokenFilter   | good                                                     | slow
+
+Examples
+
+  public void testHyphenationCompoundWordsDE() throws Exception {
+    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
+        "Aufgabe", "Überwachung" };
+
+    // Load the hyphenation grammar (see the "Grammar file" section above).
+    Reader reader = new FileReader("de_DR.xml");
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    // Decompose hyphenation candidates and keep only those found in the dictionary.
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
+        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+
+    // Print each emitted token (the original compounds plus their parts).
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
+      System.out.println(t);
+    }
+  }
+
+  public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
+    // Load the hyphenation grammar (see the "Grammar file" section above).
+    Reader reader = new FileReader("de_DR.xml");
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    // Without a dictionary every hyphenation candidate is emitted,
+    // which also produces "nonword" tokens.
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator);
+
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
+      System.out.println(t);
+    }
+  }
+
+  public void testDumbCompoundWordsSE() throws Exception {
+    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
+        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
+        "Sko", "Vind", "Rute", "Torkare", "Blad" };
+
+    // Brute-force decomposition: every dictionary word found inside a token
+    // is emitted as an additional token at the same position.
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+        new WhitespaceTokenizer(
+            new StringReader(
+                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
+        dict);
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
+      System.out.println(t);
+    }
+  }
+
+
+
\ No newline at end of file