1 package org.apache.lucene.benchmark.byTask.feeds;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.BufferedReader;
22 import java.io.IOException;
23 import java.io.StringReader;
24 import java.text.ParseException;
25 import java.util.Arrays;
26 import java.util.Date;
27 import java.util.HashSet;
28 import java.util.Properties;
30 import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
31 import org.apache.lucene.benchmark.byTask.utils.Config;
32 import org.apache.lucene.document.DateTools;
33 import org.apache.lucene.util.LuceneTestCase;
34 import org.apache.lucene.util._TestUtil;
36 public class TrecContentSourceTest extends LuceneTestCase {
38 /** A TrecContentSource which works on an in-memory String and not files. */
39 private static class StringableTrecSource extends TrecContentSource {
// Text that openNextFile() wraps in a StringReader and serves as the source content.
41 private String docs = null;
// docs: the TREC-formatted text to serve.
// forever: copied into the 'forever' field, which is not declared in this class
// (presumably inherited from TrecContentSource - confirm).
43 public StringableTrecSource(String docs, boolean forever) {
45 this.forever = forever;
// Points 'reader' at a new BufferedReader over the in-memory docs text.
// NOTE(review): the condition guarding the NoMoreDataException below is not
// visible in this view - confirm against the full file before relying on it.
49 void openNextFile() throws NoMoreDataException, IOException {
52 throw new NoMoreDataException();
57 reader = new BufferedReader(new StringReader(docs));
// Does not read the supplied config (the tests in this file pass null); it only
// installs a DemoHTMLParser as the HTML parser.
61 public void setConfig(Config config) {
62 htmlParser = new DemoHTMLParser();
// Checks a DocData against expected values:
//  - name and title must equal expName and expTitle,
//  - the body must contain expBody as a substring,
//  - the date must equal expDate.
// Throws ParseException if DateTools.stringToDate cannot convert the stored date string.
66 private void assertDocData(DocData dd, String expName, String expTitle,
67 String expBody, Date expDate)
68 throws ParseException {
70 assertEquals(expName, dd.getName());
71 assertEquals(expTitle, dd.getTitle());
// Substring match only: the body is not compared for equality.
72 assertTrue(dd.getBody().indexOf(expBody) != -1);
// A DocData without a date yields null here, so callers pass null as expDate in that case.
73 Date date = dd.getDate() != null ? DateTools.stringToDate(dd.getDate()) : null;
74 assertEquals(expDate, date);
// Asserts that requesting another document from the given source raises
// NoMoreDataException; the assertion at the end fails when 'thrown' is still false.
// NOTE(review): the statement that sets 'thrown' inside the catch block is not
// visible in this view - confirm against the full file.
77 private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception {
78 boolean thrown = false;
80 stdm.getNextDocData(null);
81 } catch (NoMoreDataException e) {
84 assertTrue("Expecting NoMoreDataException", thrown);
// A non-forever source holding a single document must return that document once
// and then report that no more data is available.
87 public void testOneDocument() throws Exception {
// Fixture: one TREC document with DOCNO TEST-000 and an HTTP-style header block.
88 String docs = "<DOC>\r\n" +
89 "<DOCNO>TEST-000</DOCNO>\r\n" +
91 "http://lucene.apache.org.trecdocmaker.test\r\n" +
92 "HTTP/1.1 200 OK\r\n" +
93 "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
94 "Server: Apache/1.3.27 (Unix)\r\n" +
95 "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
96 "Content-Length: 614\r\n" +
97 "Connection: close\r\n" +
98 "Content-Type: text/html\r\n" +
104 "TEST-000 title\r\n" +
109 "TEST-000 text\r\n" +
// forever == false: the source is expected to run out after the single document.
114 StringableTrecSource source = new StringableTrecSource(docs, false);
115 source.setConfig(null);
// The expected name is the DOCNO followed by "_0"; the expected date is the
// value of the Date: header parsed by the source itself.
117 DocData dd = source.getNextDocData(new DocData());
118 assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
119 .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
121 assertNoMoreDataException(source);
// A non-forever source holding two documents must return them in order
// (TEST-000 then TEST-001) and then report that no more data is available.
// NOTE(review): the second document's Last-Modified header says 2008 while its
// Date header says 2009; the assertions below only use the Date value - confirm
// whether the mismatch is intentional.
124 public void testTwoDocuments() throws Exception {
125 String docs = "<DOC>\r\n" +
126 "<DOCNO>TEST-000</DOCNO>\r\n" +
128 "http://lucene.apache.org.trecdocmaker.test\r\n" +
129 "HTTP/1.1 200 OK\r\n" +
130 "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
131 "Server: Apache/1.3.27 (Unix)\r\n" +
132 "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
133 "Content-Length: 614\r\n" +
134 "Connection: close\r\n" +
135 "Content-Type: text/html\r\n" +
141 "TEST-000 title\r\n" +
146 "TEST-000 text\r\n" +
152 "<DOCNO>TEST-001</DOCNO>\r\n" +
154 "http://lucene.apache.org.trecdocmaker.test\r\n" +
155 "HTTP/1.1 200 OK\r\n" +
156 "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
157 "Server: Apache/1.3.27 (Unix)\r\n" +
158 "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" +
159 "Content-Length: 614\r\n" +
160 "Connection: close\r\n" +
161 "Content-Type: text/html\r\n" +
167 "TEST-001 title\r\n" +
172 "TEST-001 text\r\n" +
177 StringableTrecSource source = new StringableTrecSource(docs, false);
178 source.setConfig(null);
// First document.
180 DocData dd = source.getNextDocData(new DocData());
181 assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
182 .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
// Second document; the DocData returned by the previous call is passed back in.
184 dd = source.getNextDocData(dd);
185 assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
186 .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
188 assertNoMoreDataException(source);
191 // If a Date: attribute is missing, make sure the document is not skipped, but
192 // rather that a null Date is assigned.
193 public void testMissingDate() throws Exception {
// First document (TEST-000) has no Date: header; the second (TEST-001) has one.
194 String docs = "<DOC>\r\n" +
195 "<DOCNO>TEST-000</DOCNO>\r\n" +
197 "http://lucene.apache.org.trecdocmaker.test\r\n" +
198 "HTTP/1.1 200 OK\r\n" +
199 "Server: Apache/1.3.27 (Unix)\r\n" +
200 "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
201 "Content-Length: 614\r\n" +
202 "Connection: close\r\n" +
203 "Content-Type: text/html\r\n" +
209 "TEST-000 title\r\n" +
214 "TEST-000 text\r\n" +
220 "<DOCNO>TEST-001</DOCNO>\r\n" +
222 "http://lucene.apache.org.trecdocmaker.test\r\n" +
223 "HTTP/1.1 200 OK\r\n" +
224 "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
225 "Server: Apache/1.3.27 (Unix)\r\n" +
226 "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" +
227 "Content-Length: 614\r\n" +
228 "Connection: close\r\n" +
229 "Content-Type: text/html\r\n" +
235 "TEST-001 title\r\n" +
240 "TEST-001 text\r\n" +
245 StringableTrecSource source = new StringableTrecSource(docs, false);
246 source.setConfig(null);
// The document without a Date: header is still returned, with a null date.
248 DocData dd = source.getNextDocData(new DocData());
249 assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
// The following document is returned with its date set from its Date: header.
251 dd = source.getNextDocData(dd);
252 assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
253 .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
255 assertNoMoreDataException(source);
258 // When a 'bad date' is input (unparsable date), make sure the DocData date is null and the document is still returned.
260 public void testBadDate() throws Exception {
// Fixture: the Date: header carries the literal text "Bad Date".
261 String docs = "<DOC>\r\n" +
262 "<DOCNO>TEST-000</DOCNO>\r\n" +
264 "http://lucene.apache.org.trecdocmaker.test\r\n" +
265 "HTTP/1.1 200 OK\r\n" +
266 "Date: Bad Date\r\n" +
267 "Server: Apache/1.3.27 (Unix)\r\n" +
268 "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
269 "Content-Length: 614\r\n" +
270 "Connection: close\r\n" +
271 "Content-Type: text/html\r\n" +
277 "TEST-000 title\r\n" +
282 "TEST-000 text\r\n" +
287 StringableTrecSource source = new StringableTrecSource(docs, false);
288 source.setConfig(null);
// Name, title and body are still checked; only the date is expected to be null.
290 DocData dd = source.getNextDocData(new DocData());
291 assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
293 assertNoMoreDataException(source);
// With the forever flag set, a source holding a single document must hand out
// that document again on the next request, with the name suffix changing from
// "_0" to "_1".
296 public void testForever() throws Exception {
297 String docs = "<DOC>\r\n" +
298 "<DOCNO>TEST-000</DOCNO>\r\n" +
300 "http://lucene.apache.org.trecdocmaker.test\r\n" +
301 "HTTP/1.1 200 OK\r\n" +
302 "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
303 "Server: Apache/1.3.27 (Unix)\r\n" +
304 "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
305 "Content-Length: 614\r\n" +
306 "Connection: close\r\n" +
307 "Content-Type: text/html\r\n" +
313 "TEST-000 title\r\n" +
318 "TEST-000 text\r\n" +
// forever == true, unlike the other String-based tests in this class.
323 StringableTrecSource source = new StringableTrecSource(docs, true);
324 source.setConfig(null);
326 DocData dd = source.getNextDocData(new DocData());
327 assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
328 .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
330 // same document, but the second iteration changes the name.
331 dd = source.getNextDocData(dd);
332 assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
333 .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
335 // Don't test that NoMoreDataException is thrown, since the forever flag is set for this source.
340 * Open a trec content source over a directory with files of all trec path types and all
341 * supported formats - bzip, gzip, txt.
// Unzips trecdocs.zip into a temp directory, reads it through a plain
// TrecContentSource configured with TrecParserByPath, and checks that exactly
// five documents are produced, that every ParsePathType value is seen, and
// that reading ends with NoMoreDataException.
343 public void testTrecFeedDirAllTypes() throws Exception {
344 File dataDir = _TestUtil.getTempDir("trecFeedAllTypes");
345 _TestUtil.unzip(getDataFile("trecdocs.zip"), dataDir);
346 TrecContentSource tcs = new TrecContentSource();
347 Properties props = new Properties();
348 props.setProperty("print.props", "false");
349 props.setProperty("content.source.verbose", "false");
350 props.setProperty("content.source.excludeIteration", "true");
351 props.setProperty("doc.maker.forever", "false");
// Backslashes in the canonical path are replaced with forward slashes before it is stored.
352 props.setProperty("docs.dir", dataDir.getCanonicalPath().replace('\\','/'));
353 props.setProperty("trec.doc.parser", TrecParserByPath.class.getName());
354 props.setProperty("content.source.forever", "false");
355 tcs.setConfig(new Config(props));
357 DocData dd = new DocData();
359 boolean gotExpectedException = false;
// Starts with every ParsePathType value; the type of each document read is removed below.
360 HashSet<ParsePathType> unseenTypes = new HashSet<ParsePathType>(Arrays.asList(ParsePathType.values()));
// NOTE(review): the declaration and increment of the counter 'n' are not
// visible in this view - confirm against the full file.
362 while (n<100) { // arbitrary limit to prevent looping forever in case of test failure
363 dd = tcs.getNextDocData(dd);
365 assertNotNull("doc data "+n+" should not be null!", dd);
366 unseenTypes.remove(tcs.currPathType);
367 switch(tcs.currPathType) {
// NOTE(review): the case labels are not visible in this view; each assertion
// below presumably belongs to one path type - confirm against the full file.
// The expected names here carry no "_0" suffix, unlike the String-based tests
// above.
369 assertDocData(dd, "TEST-000", "TEST-000 title", "TEST-000 text", tcs.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
372 assertDocData(dd, "TEST-001", "TEST-001 Title", "TEST-001 text", tcs.parseDate("1 January 1991"));
375 // no title extraction in this source for now
376 assertDocData(dd, "TEST-002", null, "DEPARTMENT OF SOMETHING", tcs.parseDate("February 3, 1994"));
379 assertDocData(dd, "TEST-003", "Test-003 title", "Some pub text", tcs.parseDate("980424"));
382 assertDocData(dd, "TEST-004", "Test-004 Title", "Some paragraph", tcs.parseDate("January 17, 1997, Sunday"));
385 assertTrue("Should never get here!", false);
// Running out of documents is the expected way for the read loop to end.
388 } catch (NoMoreDataException e) {
389 gotExpectedException = true;
391 assertTrue("Should have gotten NoMoreDataException!", gotExpectedException);
392 assertEquals("Wrong number of documents created by source!",5,n);
393 assertTrue("Did not see all types!",unseenTypes.isEmpty());