2 <project name="benchmark" default="default">
5 Lucene Benchmarking Contributions
8 <import file="../contrib-build.xml"/>
<!-- Directory that receives expanded and generated benchmark data; "location" makes Ant resolve it to an absolute path. -->
9 <property name="working.dir" location="work"/>
11 <!-- The tests have problems when run in parallel; presumably the 0 below switches off parallel test threads (TODO confirm against the imported build file). -->
12 <property name="tests.threadspercpu" value="0"/>
<!-- Probes for downloaded archives (under temp/) and for expanded or extracted data
     (mostly under ${working.dir}). Each "available" task sets its property only when the
     file or directory is present; the get-*, expand-* and extract-* targets use those
     properties in their "unless" attributes to skip work that is already done.
     The mini.exists probe covers temp/mini_newsgroups.tar.gz, the guard of get-mini-news;
     without it that guard is never set and the archive is downloaded on every run. -->
14 <target name="check-files">
15 <available file="temp/news20.tar.gz" property="news20.exists"/>
17 <available file="${working.dir}/20_newsgroup" property="news20.expanded"/>
19 <available file="temp/reuters21578.tar.gz" property="reuters.exists"/>
20 <available file="${working.dir}/reuters" property="reuters.expanded"/>
21 <available file="${working.dir}/reuters-out" property="reuters.extracted"/>
22 <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
23 <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
<available file="temp/mini_newsgroups.tar.gz" property="mini.exists"/>
24 <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
26 <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
27 <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
28 <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
29 <available file="temp/${top.100k.words.archive.filename}"
30 property="top.100k.words.archive.present"/>
31 <available file="${working.dir}/top100k-out"
32 property="top.100k.word.files.expanded"/>
<!-- Runs check-files first, then calls get-enwiki and expand-enwiki through antcall.
     Each called target is skipped when its own "unless" property is set. -->
35 <target name="enwiki-files" depends="check-files">
37 <antcall target="get-enwiki"/>
38 <antcall target="expand-enwiki"/>
<!-- Downloads the enwiki-20070527 pages-articles bzip2 dump into temp/.
     Skipped when enwiki.exists is set. -->
41 <target name="get-enwiki" unless="enwiki.exists">
42 <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
43 dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
<!-- Decompresses the downloaded bzip2 dump into temp/.
     Skipped when enwiki.expanded is set. -->
46 <target name="expand-enwiki" unless="enwiki.expanded">
47 <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
<!-- Downloads the news20 archive into temp/news20.tar.gz.
     Skipped when news20.exists is set, the property check-files derives from that same file.
     The guard previously named 20news-18828.exists, which tracks a different archive and
     could therefore skip a needed download or repeat an unneeded one. -->
50 <target name="get-news-20" unless="news20.exists">
51 <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
52 dest="temp/news20.tar.gz"/>
<!-- Downloads the Reuters-21578 archive into temp/reuters21578.tar.gz.
     Skipped when reuters.exists is set. -->
55 <target name="get-reuters" unless="reuters.exists">
57 <get src="http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
58 dest="temp/reuters21578.tar.gz"/>
<!-- Gunzips temp/news20.tar.gz into temp/, then untars temp/news20.tar into ${working.dir}.
     Skipped when news20.expanded is set. -->
61 <target name="expand-news-20" unless="news20.expanded">
62 <gunzip src="temp/news20.tar.gz" dest="temp"/>
63 <untar src="temp/news20.tar" dest="${working.dir}"/>
<!-- Gunzips temp/reuters21578.tar.gz into temp/, creates ${working.dir}/reuters and untars
     the archive into it. Skipped when reuters.expanded is set.
     NOTE(review): the fileset selecting *.txt in that directory belongs to an enclosing task
     that is not visible in this view; confirm what it does with those files. -->
65 <target name="expand-reuters" unless="reuters.expanded">
66 <gunzip src="temp/reuters21578.tar.gz" dest="temp"/>
67 <mkdir dir="${working.dir}/reuters"/>
68 <untar src="temp/reuters21578.tar" dest="${working.dir}/reuters"/>
70 <fileset dir="${working.dir}/reuters">
71 <include name="*.txt"/>
<!-- Runs check-files first, then forks org.apache.lucene.benchmark.utils.ExtractReuters with
     a 1024M maximum heap on run.classpath. The two arguments are ${working.dir}/reuters and
     ${working.dir}/reuters-out (presumably input and output directories; confirm against the
     class). Skipped when reuters.extracted is set. -->
76 <target name="extract-reuters" depends="check-files" unless="reuters.extracted">
77 <java classname="org.apache.lucene.benchmark.utils.ExtractReuters" maxmemory="1024M" fork="true">
78 <classpath refid="run.classpath"/>
79 <arg file="${working.dir}/reuters"/>
80 <arg file="${working.dir}/reuters-out"/>
<!-- Downloads the 20news-18828 archive into temp/20news-18828.tar.gz.
     Skipped when 20news-18828.exists is set. -->
83 <target name="get-20news-18828" unless="20news-18828.exists">
84 <get src="http://people.csail.mit.edu/u/j/jrennie/public_html/20Newsgroups/20news-18828.tar.gz"
85 dest="temp/20news-18828.tar.gz"/>
<!-- Gunzips temp/20news-18828.tar.gz into temp/, then untars the result into ${working.dir}.
     Skipped when 20news-18828.expanded is set. -->
88 <target name="expand-20news-18828" unless="20news-18828.expanded">
89 <gunzip src="temp/20news-18828.tar.gz" dest="temp"/>
90 <untar src="temp/20news-18828.tar" dest="${working.dir}"/>
<!-- Downloads the mini_newsgroups archive into temp/mini_newsgroups.tar.gz.
     Skipped when mini.exists is set. -->
92 <target name="get-mini-news" unless="mini.exists">
93 <get src="http://kdd.ics.uci.edu/databases/20newsgroups/mini_newsgroups.tar.gz"
94 dest="temp/mini_newsgroups.tar.gz"/>
<!-- Gunzips temp/mini_newsgroups.tar.gz into temp/, then untars the result into ${working.dir}.
     Skipped when mini.expanded is set. -->
96 <target name="expand-mini-news" unless="mini.expanded">
97 <gunzip src="temp/mini_newsgroups.tar.gz" dest="temp"/>
98 <untar src="temp/mini_newsgroups.tar" dest="${working.dir}"/>
<!-- File name and base URL of the top-100k-words archive. The get, expand and check-files
     targets build the download URL and the temp/ path from these two properties. -->
101 <property name="top.100k.words.archive.filename"
102 value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"/>
103 <property name="top.100k.words.archive.base.url"
104 value="http://people.apache.org/~rmuir/wikipedia"/>
<!-- Downloads the top-100k-words archive from the configured base URL into temp/.
     Skipped when top.100k.words.archive.present is set. -->
105 <target name="get-top-100k-words-archive" unless="top.100k.words.archive.present">
107 <get src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"
108 dest="temp/${top.100k.words.archive.filename}"/>
<!-- Creates ${working.dir}/top100k-out and unpacks the bzip2-compressed tar archive into it,
     overwriting existing files. Skipped when top.100k.word.files.expanded is set. -->
110 <target name="expand-top-100k-word-files" unless="top.100k.word.files.expanded">
111 <mkdir dir="${working.dir}/top100k-out"/>
112 <untar src="temp/${top.100k.words.archive.filename}"
113 overwrite="true" compression="bzip2" dest="${working.dir}/top100k-out"/>
<!-- Runs check-files first, creates ${working.dir}, then calls the download and expand
     targets for the top-100k-words archive through antcall. -->
116 <target name="top-100k-wiki-word-files" depends="check-files">
117 <mkdir dir="${working.dir}"/>
118 <antcall target="get-top-100k-words-archive"/>
119 <antcall target="expand-top-100k-word-files"/>
<!-- Runs check-files first, then calls get-reuters, expand-reuters and extract-reuters in
     that order through antcall. Only the Reuters data set is prepared here. -->
122 <target name="get-files" depends="check-files">
124 <antcall target="get-reuters"/>
125 <antcall target="expand-reuters"/>
126 <antcall target="extract-reuters"/>
<!-- Compile classpath: the memory, highlighter, analyzers-common and facet jars plus
     base.classpath. NOTE(review): the **/*.jar include belongs to a fileset whose opening
     tag is not visible in this view (presumably the module's lib directory; confirm). -->
129 <path id="classpath">
130 <pathelement path="${memory.jar}"/>
131 <pathelement path="${highlighter.jar}"/>
132 <pathelement path="${analyzers-common.jar}"/>
133 <pathelement path="${facet.jar}"/>
134 <path refid="base.classpath"/>
136 <include name="**/*.jar"/>
<!-- Runtime classpath: the compile classpath, the compiled classes in
     ${build.dir}/classes/java, and any extra entries supplied in benchmark.ext.classpath. -->
139 <path id="run.classpath">
140 <path refid="classpath"/>
141 <pathelement location="${build.dir}/classes/java"/>
142 <pathelement path="${benchmark.ext.classpath}"/>
<!-- Defaults for run-task: the algorithm file to execute and the maximum JVM heap.
     Ant properties are immutable, so values given with -Dtask.alg / -Dtask.mem take precedence. -->
145 <property name="task.alg" location="conf/micro-standard.alg"/>
146 <property name="task.mem" value="140M"/>
<!-- Runs compile, check-files and get-files first, echoes the working directory, then forks
     org.apache.lucene.benchmark.byTask.Benchmark on run.classpath with ${task.mem} as the
     maximum heap and ${task.alg} as its single argument. -->
148 <target name="run-task" depends="compile,check-files,get-files"
149 description="Run compound penalty perf test (optional: -Dtask.alg=your-algorithm-file -Dtask.mem=java-max-mem)">
150 <echo>Working Directory: ${working.dir}</echo>
151 <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="${task.mem}" fork="true">
152 <classpath refid="run.classpath"/>
153 <arg file="${task.alg}"/>
<!-- Runs compile, check-files and enwiki-files first, echoes the working directory, then forks
     the Benchmark driver with a fixed 1024M maximum heap on conf/extractWikipedia.alg. -->
157 <target name="enwiki" depends="compile,check-files,enwiki-files">
158 <echo>Working Directory: ${working.dir}</echo>
159 <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="1024M" fork="true">
163 <classpath refid="run.classpath"/>
164 <arg file="conf/extractWikipedia.alg"/>
<!-- Inputs and outputs of the collation target: the algorithm file, the raw benchmark
     output file and the JIRA-table output file, both outputs under ${working.dir}. -->
168 <property name="collation.alg.file" location="conf/collation.alg"/>
169 <property name="collation.output.file"
170 value="${working.dir}/collation.benchmark.output.txt"/>
171 <property name="collation.jira.output.file"
172 value="${working.dir}/collation.bm2jira.output.txt"/>
<!-- Classpath for the collation benchmark: run.classpath plus the ICU contrib jar (${icu.jar})
     and every icu4j*.jar found in ${common.dir}/contrib/icu/lib. -->
174 <path id="collation.runtime.classpath">
175 <path refid="run.classpath"/>
176 <pathelement path="${icu.jar}"/>
177 <fileset dir="${common.dir}/contrib/icu/lib" includes="icu4j*.jar"/>
<!-- Collation benchmark. Runs compile, jar-icu and top-100k-wiki-word-files first, then:
     1. forks the Benchmark driver on ${collation.alg.file} with ${task.mem} as the maximum
        heap, writing its output to ${collation.output.file};
     2. runs perl scripts/collation.bm2jira.pl on that output file and writes the result to
        ${collation.jira.output.file}. failonerror="true" fails the build if perl exits
        with a non-zero code. -->
180 <target name="collation" depends="compile,jar-icu,top-100k-wiki-word-files">
181 <echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
182 <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
183 maxmemory="${task.mem}" output="${collation.output.file}">
184 <classpath refid="collation.runtime.classpath"/>
185 <arg file="${collation.alg.file}"/>
187 <echo>Benchmark output is in file: ${collation.output.file}</echo>
188 <echo>Converting to JIRA table format...</echo>
189 <exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
190 <arg value="scripts/collation.bm2jira.pl"/>
191 <arg value="${collation.output.file}"/>
193 <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
<!-- Inputs and outputs of the shingle target: the algorithm file, the raw benchmark
     output file and the JIRA-table output file, both outputs under ${working.dir}. -->
196 <property name="shingle.alg.file" location="conf/shingle.alg"/>
197 <property name="shingle.output.file"
198 value="${working.dir}/shingle.benchmark.output.txt"/>
199 <property name="shingle.jira.output.file"
200 value="${working.dir}/shingle.bm2jira.output.txt"/>
<!-- Classpath alias for the shingle benchmark; it contains only run.classpath.
     NOTE(review): the visible shingle target references run.classpath directly, not this id. -->
202 <path id="shingle.runtime.classpath">
203 <path refid="run.classpath"/>
<!-- Shingle benchmark. Runs compile and get-files first, then:
     1. forks the Benchmark driver on ${shingle.alg.file} with ${task.mem} as the maximum
        heap, writing its output to ${shingle.output.file};
     2. runs perl scripts/shingle.bm2jira.pl on that output file and writes the result to
        ${shingle.jira.output.file}. failonerror="true" fails the build if perl exits
        with a non-zero code. -->
206 <target name="shingle" depends="compile,get-files">
207 <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
208 <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
209 maxmemory="${task.mem}" output="${shingle.output.file}">
210 <classpath refid="run.classpath"/>
211 <arg file="${shingle.alg.file}"/>
213 <echo>Benchmark output is in file: ${shingle.output.file}</echo>
214 <echo>Converting to JIRA table format...</echo>
215 <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
216 <arg value="scripts/shingle.bm2jira.pl"/>
217 <arg value="${shingle.output.file}"/>
219 <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
<!-- Extends the imported contrib-build.init by also depending on the jar-memory,
     jar-highlighter, jar-analyzers-common and jar-facet targets (defined outside this view;
     presumably they build the jars named in the classpath). -->
222 <target name="init" depends="contrib-build.init,jar-memory,jar-highlighter,jar-analyzers-common,jar-facet"/>
<!-- Selects the *.java files in the demohtml feed package whose content matches the regular
     expression "Generated.*By.*JavaCC". NOTE(review): the task that consumes this fileset is
     not visible in this view (presumably a delete; confirm). -->
224 <target name="clean-javacc">
225 <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
226 <containsregexp expression="Generated.*By.*JavaCC"/>
<!-- Runs init and javacc-check first and executes only when javacc.present is set. Invokes the
     invoke-javacc macro on HTMLParser.jj, writing output into the same demohtml source
     directory. The macro call continues past the lines visible in this view. -->
230 <target name="javacc" depends="init,javacc-check" if="javacc.present">
231 <invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
232 outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
<!-- Extends the imported contrib-build.dist-maven: afterwards it passes the patched
     xercesImpl 2.9.1 jar from lib/ and its POM template to m2-deploy-with-pom-template. -->
236 <target name="dist-maven" depends="contrib-build.dist-maven">
237 <m2-deploy-with-pom-template pom.xml="lib/lucene-xercesImpl-pom.xml.template"
238 jar.file="lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar" />