Configured build for Ubuntu and added Stempel polish analyzer
[pylucene.git] / lucene-java-3.4.0 / lucene / contrib / benchmark / build.xml
1 <?xml version="1.0"?>
2 <project name="benchmark" default="default">
3
4     <description>
5         Lucene Benchmarking Contributions
6     </description>
7
8     <import file="../contrib-build.xml"/>
9     <property name="working.dir" location="work"/> <!-- directory the data-set targets below expand and extract into -->
10
11     <!-- The tests have problems when run in parallel, hence 0 threads per CPU.
           NOTE(review): this property is consumed outside this file; confirm how 0 is interpreted there. -->
12     <property name="tests.threadspercpu" value="0"/>
13
14     <target name="check-files">
           <!-- Probes for downloaded archives (under temp/) and for expanded or extracted
                data (mostly under ${working.dir}). Each property is set only when the
                named file or directory exists; the get-*, expand-* and extract-* targets
                test these properties in their unless= guards. -->
15         <available file="temp/news20.tar.gz" property="news20.exists"/>
16
17         <available file="${working.dir}/20_newsgroup" property="news20.expanded"/>
18
19         <available file="temp/reuters21578.tar.gz" property="reuters.exists"/>
20         <available file="${working.dir}/reuters" property="reuters.expanded"/>
21         <available file="${working.dir}/reuters-out" property="reuters.extracted"/>
22         <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
23         <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
           <!-- get-mini-news is guarded by mini.exists; set it here so an archive that
                is already in temp/ is not downloaded again. -->
           <available file="temp/mini_newsgroups.tar.gz" property="mini.exists"/>
24         <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>
25         
26         <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
27         <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
28         <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
29         <available file="temp/${top.100k.words.archive.filename}"
30                    property="top.100k.words.archive.present"/>
31         <available file="${working.dir}/top100k-out" 
32                    property="top.100k.word.files.expanded"/>
33     </target>
34
35     <target depends="check-files" name="enwiki-files">
           <!-- Makes sure the enwiki dump is downloaded and decompressed under temp/.
                Both steps go through antcall; each called target carries its own
                unless= guard (enwiki.exists / enwiki.expanded). -->
36         <mkdir dir="temp"/>
37         <antcall target="get-enwiki"></antcall>
38         <antcall target="expand-enwiki"></antcall>
39     </target>
40
41     <target unless="enwiki.exists" name="get-enwiki">
           <!-- Downloads the compressed enwiki dump into temp/; skipped when enwiki.exists is set. -->
42         <get dest="temp/enwiki-20070527-pages-articles.xml.bz2"
43              src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"/>
44     </target>
45
46     <target unless="enwiki.expanded" name="expand-enwiki">
           <!-- Decompresses the downloaded dump into temp/; skipped when enwiki.expanded is set. -->
47         <bunzip2 dest="temp" src="temp/enwiki-20070527-pages-articles.xml.bz2"/>
48     </target>
49
50     <target name="get-news-20" unless="news20.exists">
           <!-- Downloads news20.tar.gz into temp/. The guard is news20.exists, the property
                check-files derives from temp/news20.tar.gz. It previously tested
                20news-18828.exists, which tracks a different archive, so this download
                could be skipped or repeated for the wrong reason. -->
51         <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
52              dest="temp/news20.tar.gz"/>
53
54     </target>
55     <target unless="reuters.exists" name="get-reuters">
           <!-- Downloads the Reuters-21578 archive into temp/; skipped when reuters.exists is set. -->
56
57         <get dest="temp/reuters21578.tar.gz"
58             src="http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"/>
59     </target>
60
61     <target unless="news20.expanded" name="expand-news-20">
           <!-- Gunzips the news20 archive inside temp/, then unpacks the resulting tar
                into ${working.dir}; skipped when news20.expanded is set. -->
62         <gunzip dest="temp" src="temp/news20.tar.gz"/>
63         <untar dest="${working.dir}" src="temp/news20.tar"/>
64     </target>
65     <target unless="reuters.expanded" name="expand-reuters">
           <!-- Unpacks the Reuters archive into ${working.dir}/reuters and then removes the
                *.txt files from the top level of that directory; skipped when
                reuters.expanded is set. -->
66         <gunzip dest="temp" src="temp/reuters21578.tar.gz"/>
67         <mkdir dir="${working.dir}/reuters"/>
68         <untar dest="${working.dir}/reuters" src="temp/reuters21578.tar"/>
69         <delete>
70             <fileset dir="${working.dir}/reuters">
71                 <include name="*.txt"></include>
72             </fileset>
73         </delete>
74
75     </target>
76     <target depends="check-files" unless="reuters.extracted" name="extract-reuters">
           <!-- Runs ExtractReuters in a forked JVM (1024M max heap) with the expanded
                reuters directory as first argument and reuters-out as second; skipped
                when reuters.extracted is set. -->
77         <java fork="true" maxmemory="1024M" classname="org.apache.lucene.benchmark.utils.ExtractReuters">
78             <classpath refid="run.classpath"/>
79             <arg file="${working.dir}/reuters"/>
80             <arg file="${working.dir}/reuters-out"/>
81         </java>
82     </target>
83     <target unless="20news-18828.exists" name="get-20news-18828">
           <!-- Downloads the 20news-18828 archive into temp/; skipped when 20news-18828.exists is set. -->
84         <get dest="temp/20news-18828.tar.gz"
85              src="http://people.csail.mit.edu/u/j/jrennie/public_html/20Newsgroups/20news-18828.tar.gz"/>
86
87     </target>
88     <target unless="20news-18828.expanded" name="expand-20news-18828">
           <!-- Gunzips the 20news-18828 archive inside temp/ and unpacks the tar into
                ${working.dir}; skipped when 20news-18828.expanded is set. -->
89         <gunzip dest="temp" src="temp/20news-18828.tar.gz"/>
90         <untar dest="${working.dir}" src="temp/20news-18828.tar"/>
91     </target>
92     <target unless="mini.exists" name="get-mini-news">
           <!-- Downloads the mini_newsgroups archive into temp/; skipped when mini.exists is set. -->
93         <get dest="temp/mini_newsgroups.tar.gz"
94              src="http://kdd.ics.uci.edu/databases/20newsgroups/mini_newsgroups.tar.gz"/>
95     </target>
96     <target unless="mini.expanded" name="expand-mini-news">
           <!-- Gunzips the mini_newsgroups archive inside temp/ and unpacks the tar into
                ${working.dir}; skipped when mini.expanded is set. -->
97         <gunzip dest="temp" src="temp/mini_newsgroups.tar.gz"/>
98         <untar dest="${working.dir}" src="temp/mini_newsgroups.tar"/>
99     </target>
100
101         <property name="top.100k.words.archive.filename" 
102                   value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"/> <!-- archive file name, used both in the download URL and under temp/ -->
103         <property name="top.100k.words.archive.base.url"
104                   value="http://people.apache.org/~rmuir/wikipedia"/> <!-- URL prefix the archive file name is appended to -->
105         <target unless="top.100k.words.archive.present" name="get-top-100k-words-archive">
                <!-- Downloads the top-100k-words archive into temp/ (created if missing);
                     skipped when top.100k.words.archive.present is set. -->
106                 <mkdir dir="temp"/>
107             <get dest="temp/${top.100k.words.archive.filename}"
108                  src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"/>
109         </target>
110         <target unless="top.100k.word.files.expanded" name="expand-top-100k-word-files">
                <!-- Unpacks the bzip2-compressed tar straight into ${working.dir}/top100k-out,
                     overwriting existing files; skipped when top.100k.word.files.expanded is set. -->
111                 <mkdir dir="${working.dir}/top100k-out"/>
112             <untar dest="${working.dir}/top100k-out" compression="bzip2"
113                    overwrite="true" src="temp/${top.100k.words.archive.filename}"/>
114         </target>
115         
116         <target depends="check-files" name="top-100k-wiki-word-files">
              <!-- Creates ${working.dir}, then fetches and expands the top-100k word files.
                   Each antcall target has its own unless= guard. -->
117           <mkdir dir="${working.dir}"/>
118           <antcall target="get-top-100k-words-archive"></antcall>
119           <antcall target="expand-top-100k-word-files"></antcall>
120         </target>
121         
122     <target depends="check-files" name="get-files">
            <!-- Prepares the Reuters data set: download, expand, then extract. Each step is
                 an antcall to a target with its own unless= guard. -->
123         <mkdir dir="temp"/>
124         <antcall target="get-reuters"></antcall>
125         <antcall target="expand-reuters"></antcall>
126         <antcall target="extract-reuters"></antcall>
127     </target>
128
129     <path id="classpath">
          <!-- Module classpath, in this order: the memory, highlighter and analyzers-common
               jars, then base.classpath, then every jar under lib/ (searched recursively).
               The *.jar properties and base.classpath are defined outside this file. -->
130       <pathelement path="${memory.jar}"/>
131       <pathelement path="${highlighter.jar}"/>
132       <pathelement path="${analyzers-common.jar}"/>
133       <path refid="base.classpath"/>
134         <fileset dir="lib">
135                 <include name="**/*.jar"/>
136         </fileset>
137     </path>
138     <path id="run.classpath">
            <!-- Classpath for the forked benchmark JVMs: the module classpath, this module's
                 compiled classes, then whatever benchmark.ext.classpath supplies. -->
139         <path refid="classpath"/>
140         <pathelement location="${build.dir}/classes/java"/>
141         <pathelement path="${benchmark.ext.classpath}"/>
142     </path>
143
144     <property name="task.alg" location="conf/micro-standard.alg"/> <!-- default algorithm file for run-task; override with -Dtask.alg=... -->
145     <property name="task.mem" value="140M"/> <!-- max heap for the forked benchmark JVM; override with -Dtask.mem=... -->
146
147     <target name="run-task" depends="compile,check-files,get-files" 
148      description="Run compound penalty perf test (optional: -Dtask.alg=your-algorithm-file -Dtask.mem=java-max-mem)">
            <!-- Runs the byTask Benchmark driver in a forked JVM with ${task.alg} as its only
                 argument and ${task.mem} as the max heap. -->
149         <echo message="Working Directory: ${working.dir}"/>
150         <java fork="true" maxmemory="${task.mem}" classname="org.apache.lucene.benchmark.byTask.Benchmark">
151             <classpath refid="run.classpath"/>
152             <arg file="${task.alg}"/>
153         </java>
154     </target>
155
156     <target depends="compile,check-files,enwiki-files" name="enwiki">
            <!-- Runs the byTask Benchmark driver on conf/extractWikipedia.alg in a forked JVM
                 (1024M max heap) with Java assertions enabled. -->
157         <echo message="Working Directory: ${working.dir}"/>
158         <java fork="true" maxmemory="1024M" classname="org.apache.lucene.benchmark.byTask.Benchmark">
159             <assertions>
160               <enable></enable>
161             </assertions>
162             <classpath refid="run.classpath"/>
163             <arg file="conf/extractWikipedia.alg"/>
164         </java>
165     </target>
166
167         <property name="collation.alg.file" location="conf/collation.alg"/> <!-- algorithm file run by the collation target -->
168         <property name="collation.output.file" 
169                   value="${working.dir}/collation.benchmark.output.txt"/> <!-- raw benchmark output -->
170         <property name="collation.jira.output.file" 
171                   value="${working.dir}/collation.bm2jira.output.txt"/> <!-- output converted to JIRA table format -->
172         
173         <path id="collation.runtime.classpath">
              <!-- run.classpath plus the ICU pieces: ${icu.jar} (defined outside this file)
                   and the icu4j jars under contrib/icu/lib. -->
174           <path refid="run.classpath"/>
175     <pathelement path="${icu.jar}"/>
176     <fileset dir="${common.dir}/contrib/icu/lib" includes="icu4j*.jar"/>
177         </path>
178         
179         <target depends="compile,jar-icu,top-100k-wiki-word-files" name="collation">
                <!-- Runs the collation benchmark in a forked JVM, capturing its output in
                     ${collation.output.file}, then feeds that file to the perl script
                     scripts/collation.bm2jira.pl and stores the script's output in
                     ${collation.jira.output.file}. A failing perl run fails the build. -->
180             <echo message="Running contrib/benchmark with alg file: ${collation.alg.file}"/>
181             <java classname="org.apache.lucene.benchmark.byTask.Benchmark" fork="true"
182                   output="${collation.output.file}" maxmemory="${task.mem}">
183               <classpath refid="collation.runtime.classpath"/>
184               <arg file="${collation.alg.file}"/>
185             </java>
186             <echo message="Benchmark output is in file: ${collation.output.file}"/>
187             <echo message="Converting to JIRA table format..."/>
188             <exec failonerror="true" output="${collation.jira.output.file}" executable="perl">
189               <arg value="scripts/collation.bm2jira.pl"/>
190               <arg value="${collation.output.file}"/>
191             </exec>
192             <echo message="Benchmark output in JIRA table format is in file: ${collation.jira.output.file}"/>
193         </target>
194         
195     <property name="shingle.alg.file" location="conf/shingle.alg"/> <!-- algorithm file run by the shingle target -->
196     <property name="shingle.output.file" 
197               value="${working.dir}/shingle.benchmark.output.txt"/> <!-- raw benchmark output -->
198     <property name="shingle.jira.output.file" 
199               value="${working.dir}/shingle.bm2jira.output.txt"/> <!-- output converted to JIRA table format -->
200         
201     <path id="shingle.runtime.classpath">
          <!-- Currently just run.classpath under a shingle-specific id. -->
202       <path refid="run.classpath"/>
203     </path>
204         
205     <target name="shingle" depends="compile,get-files">
          <!-- Runs the shingle benchmark in a forked JVM, capturing its output in
               ${shingle.output.file}, then feeds that file to the perl script
               scripts/shingle.bm2jira.pl and stores the script's output in
               ${shingle.jira.output.file}. A failing perl run fails the build.
               Uses shingle.runtime.classpath (declared in this file for this benchmark),
               mirroring how the collation target uses collation.runtime.classpath. -->
206       <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
207       <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark" 
208             maxmemory="${task.mem}" output="${shingle.output.file}">
209         <classpath refid="shingle.runtime.classpath"/>
210         <arg file="${shingle.alg.file}"/>
211       </java>
212       <echo>Benchmark output is in file: ${shingle.output.file}</echo>
213       <echo>Converting to JIRA table format...</echo>
214       <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
215         <arg value="scripts/shingle.bm2jira.pl"/>
216         <arg value="${shingle.output.file}"/>
217       </exec>
218       <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
219     </target>
220
221     <target name="init" depends="contrib-build.init,jar-memory,jar-highlighter,jar-analyzers-common"/> <!-- runs the imported contrib-build init, then the jar-memory, jar-highlighter and jar-analyzers-common targets (defined outside this file) -->
222   
223     <target name="clean-javacc">
          <!-- Deletes the JavaCC-generated sources of the demohtml parser: the *.java files
               in that package whose content matches "Generated.*By.*JavaCC". The fileset
               used to sit directly inside the target, which only declares a file set and
               removes nothing, so it is now wrapped in a delete task. -->
          <delete>
224         <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
225           <containsregexp expression="Generated.*By.*JavaCC"/>
226         </fileset>
          </delete>
227     </target>
228     
229     <target if="javacc.present" depends="init,javacc-check" name="javacc">
          <!-- Regenerates the demohtml parser from HTMLParser.jj into the same source
               directory; runs only when javacc.present is set. invoke-javacc and
               javacc-check are defined outside this file. -->
230       <invoke-javacc outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
231                      target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
232                      />
233     </target>
234
235   <target depends="contrib-build.dist-maven" name="dist-maven">
        <!-- After the imported dist-maven target, passes the patched Xerces jar under lib/
             and its POM template to m2-deploy-with-pom-template (defined outside this file). -->
236     <m2-deploy-with-pom-template jar.file="lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"
237                                  pom.xml="lib/lucene-xercesImpl-pom.xml.template" />
238   </target>
239 </project>