pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / analyzers / common / src / test / org / apache / lucene / analysis / de / data.txt
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/data.txt b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/data.txt

new file mode 100644 (file)

index 0000000..5b8ce5f
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/data.txt
@@ -0,0 +1,48 @@
+# German special characters are replaced:
+häufig        haufig
+
+# here the stemmer works okay, it maps related words to the same stem:
+abschließen   abschliess
+abschließender        abschliess
+abschließendes        abschliess
+abschließenden        abschliess
+
+Tisch  tisch
+Tische tisch
+Tischen        tisch
+
+Haus   hau
+Hauses hau
+Häuser        hau
+Häusern       hau
+# here's a case where overstemming occurs, i.e. a word is 
+# mapped to the same stem as unrelated words:
+hauen  hau
+
+# here's a case where understemming occurs, i.e. two related words
+# are not mapped to the same stem. This is the case with basically
+# all irregular forms:
+Drama  drama
+Dramen dram
+
+# "ß" is replaced with "ss":
+Ausmaß        ausmass
+
+# fake words to test if suffixes are cut off:
+xxxxxe xxxxx
+xxxxxs xxxxx
+xxxxxn xxxxx
+xxxxxt xxxxx
+xxxxxem        xxxxx
+xxxxxer        xxxxx
+xxxxxnd        xxxxx
+# the suffixes are also removed when combined:
+xxxxxetende    xxxxx
+
+# words that are shorter than four characters are not changed:
+xxe    xxe
+# -em and -er are not removed from words shorter than five characters:
+xxem   xxem
+xxer   xxer
+# -nd is not removed from words shorter than six characters:
+xxxnd  xxxnd