pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / benchmark / src / test / org / apache / lucene / benchmark / byTask / feeds / TrecContentSourceTest.java
diff --git a/lucene-java-3.5.0/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java b/lucene-java-3.5.0/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
new file mode 100644 (file)
index 0000000..98ac614
--- /dev/null
@@ -0,0 +1,396 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Properties;
+
+import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+public class TrecContentSourceTest extends LuceneTestCase {
+
+  /** A TrecDocMaker which works on a String and not files. */
+  private static class StringableTrecSource extends TrecContentSource {
+  
+    private String docs = null;
+    
+    public StringableTrecSource(String docs, boolean forever) {
+      this.docs = docs;
+      this.forever = forever;
+    }
+    
+    @Override
+    void openNextFile() throws NoMoreDataException, IOException {
+      if (reader != null) {
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
+        ++iteration;
+      }
+      
+      reader = new BufferedReader(new StringReader(docs));
+    }
+    
+    @Override
+    public void setConfig(Config config) {
+      htmlParser = new DemoHTMLParser();
+    }
+  }
+  
+  private void assertDocData(DocData dd, String expName, String expTitle,
+                             String expBody, Date expDate)
+      throws ParseException {
+    assertNotNull(dd);
+    assertEquals(expName, dd.getName());
+    assertEquals(expTitle, dd.getTitle());
+    assertTrue(dd.getBody().indexOf(expBody) != -1);
+    Date date = dd.getDate() != null ? DateTools.stringToDate(dd.getDate()) : null;
+    assertEquals(expDate, date);
+  }
+  
+  private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception {
+    boolean thrown = false;
+    try {
+      stdm.getNextDocData(null);
+    } catch (NoMoreDataException e) {
+      thrown = true;
+    }
+    assertTrue("Expecting NoMoreDataException", thrown);
+  }
+  
+  public void testOneDocument() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    assertNoMoreDataException(source);
+  }
+  
+  public void testTwoDocuments() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>\r\n" +
+                  "<DOC>\r\n" + 
+                  "<DOCNO>TEST-001</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-001 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-001 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
+        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+    
+    assertNoMoreDataException(source);
+  }
+
+  // If a Date: attribute is missing, make sure the document is not skipped, but
+  // rather that null Data is assigned.
+  public void testMissingDate() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>\r\n" +
+                  "<DOC>\r\n" + 
+                  "<DOCNO>TEST-001</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-001 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-001 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+    
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
+        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
+    
+    assertNoMoreDataException(source);
+  }
+
+  // When a 'bad date' is input (unparsable date), make sure the DocData date is
+  // assigned null.
+  public void testBadDate() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Bad Date\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
+    
+    assertNoMoreDataException(source);
+  }
+
+  public void testForever() throws Exception {
+    String docs = "<DOC>\r\n" + 
+                  "<DOCNO>TEST-000</DOCNO>\r\n" + 
+                  "<DOCHDR>\r\n" + 
+                  "http://lucene.apache.org.trecdocmaker.test\r\n" + 
+                  "HTTP/1.1 200 OK\r\n" + 
+                  "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Server: Apache/1.3.27 (Unix)\r\n" + 
+                  "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + 
+                  "Content-Length: 614\r\n" + 
+                  "Connection: close\r\n" + 
+                  "Content-Type: text/html\r\n" + 
+                  "</DOCHDR>\r\n" + 
+                  "<html>\r\n" + 
+                  "\r\n" + 
+                  "<head>\r\n" + 
+                  "<title>\r\n" + 
+                  "TEST-000 title\r\n" + 
+                  "</title>\r\n" + 
+                  "</head>\r\n" + 
+                  "\r\n" + 
+                  "<body>\r\n" + 
+                  "TEST-000 text\r\n" + 
+                  "\r\n" + 
+                  "</body>\r\n" + 
+                  "\r\n" + 
+                  "</DOC>";
+    StringableTrecSource source = new StringableTrecSource(docs, true);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+    
+    // same document, but the second iteration changes the name.
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
+        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+
+    // Don't test that NoMoreDataException is thrown, since the forever flag is
+    // turned on.
+  }
+  
+  /** 
+   * Open a trec content source over a directory with files of all trec path types and all
+   * supported formats - bzip, gzip, txt. 
+   */
+  public void testTrecFeedDirAllTypes() throws Exception {
+    File dataDir =  _TestUtil.getTempDir("trecFeedAllTypes");
+    _TestUtil.unzip(getDataFile("trecdocs.zip"), dataDir);
+    TrecContentSource tcs = new TrecContentSource();
+    Properties props = new Properties();
+    props.setProperty("print.props", "false");
+    props.setProperty("content.source.verbose", "false");
+    props.setProperty("content.source.excludeIteration", "true");
+    props.setProperty("doc.maker.forever", "false");
+    props.setProperty("docs.dir", dataDir.getCanonicalPath().replace('\\','/')); 
+    props.setProperty("trec.doc.parser", TrecParserByPath.class.getName());
+    props.setProperty("content.source.forever", "false");
+    tcs.setConfig(new Config(props));
+    tcs.resetInputs();
+    DocData dd = new DocData();
+    int n = 0;
+    boolean gotExpectedException = false;
+    HashSet<ParsePathType> unseenTypes = new HashSet<ParsePathType>(Arrays.asList(ParsePathType.values()));
+    try {
+      while (n<100) { // arbiterary limit to prevent looping forever in case of test failure
+        dd = tcs.getNextDocData(dd);
+        ++n;
+        assertNotNull("doc data "+n+" should not be null!", dd);
+        unseenTypes.remove(tcs.currPathType);
+        switch(tcs.currPathType) {
+          case GOV2:
+            assertDocData(dd, "TEST-000", "TEST-000 title", "TEST-000 text", tcs.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
+            break;
+          case FBIS:
+            assertDocData(dd, "TEST-001", "TEST-001 Title", "TEST-001 text", tcs.parseDate("1 January 1991"));
+            break;
+          case FR94:
+            // no title extraction in this source for now
+            assertDocData(dd, "TEST-002", null, "DEPARTMENT OF SOMETHING", tcs.parseDate("February 3, 1994"));
+            break;
+          case FT:
+            assertDocData(dd, "TEST-003", "Test-003 title", "Some pub text", tcs.parseDate("980424"));
+            break;
+          case LATIMES:
+            assertDocData(dd, "TEST-004", "Test-004 Title", "Some paragraph", tcs.parseDate("January 17, 1997, Sunday"));
+            break;
+          default:
+            assertTrue("Should never get here!", false);
+        }
+      }
+    } catch (NoMoreDataException e) {
+      gotExpectedException = true;
+    }
+    assertTrue("Should have gotten NoMoreDataException!", gotExpectedException);
+    assertEquals("Wrong number of documents created by source!",5,n);
+    assertTrue("Did not see all types!",unseenTypes.isEmpty());
+  }
+
+}