--- /dev/null
+package org.apache.lucene.benchmark.byTask.feeds.demohtml;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Properties;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests for the demo {@link HTMLParser}: text extraction through the reader
+ * returned by {@code getReader()}, plus the meta tag, title and summary
+ * accessors.
+ */
+public class TestHtmlParser extends LuceneTestCase {
+
+  /** Non-ASCII body text must be passed through unchanged. */
+  public void testUnicode() throws Exception {
+    String text = "<html><body>汉语</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("汉语", parser);
+  }
+
+  /**
+   * Character references must be decoded: the input uses two numeric (hex)
+   * references (U+6C49, U+8BED) and the named reference for the yen sign
+   * (U+00A5); the reader is expected to yield the decoded characters.
+   */
+  public void testEntities() throws Exception {
+    String text = "<html><body>&#x6C49;&#x8BED;&yen;</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("汉语¥", parser);
+  }
+
+  /** Neither {@code <!-- ... -->} nor {@code <! ... -->} content may reach the output. */
+  public void testComments() throws Exception {
+    String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+
+  /** The content of a script element must not appear in the extracted text. */
+  public void testScript() throws Exception {
+    String text = "<html><body><script type=\"text/javascript\">" +
+        "document.write(\"test\")</script>foo</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+
+  /** The content of a style element must not appear in the extracted text. */
+  public void testStyle() throws Exception {
+    String text = "<html><head><style type=\"text/css\">" +
+        "body{background-color:blue;}</style>" +
+        "</head><body>foo</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+
+  /** A leading DOCTYPE declaration must not appear in the extracted text. */
+  public void testDoctype() throws Exception {
+    String text = "<!DOCTYPE HTML PUBLIC " +
+        "\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
+        "\"http://www.w3.org/TR/html4/loose.dtd\">" +
+        "<html><body>foo</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+
+  /**
+   * Every meta element (both {@code name} and {@code http-equiv} forms) must be
+   * reported by {@code getMetaTags()}. The {@code http-equiv} entry is expected
+   * under a lower-cased key with a lower-cased value.
+   */
+  public void testMeta() throws Exception {
+    String text = "<html><head>" +
+        "<meta name=\"a\" content=\"1\" />" +
+        "<meta name=\"b\" content=\"2\" />" +
+        "<meta name=\"keywords\" content=\"this is a test\" />" +
+        "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
+        "</head><body>foobar</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    Properties tags = parser.getMetaTags();
+    assertEquals(4, tags.size());
+    assertEquals("1", tags.get("a"));
+    assertEquals("2", tags.get("b"));
+    assertEquals("this is a test", tags.get("keywords"));
+    assertEquals("text/html;charset=utf-8", tags.get("content-type"));
+  }
+
+  /** The title must be found even when the tag is written in upper case. */
+  public void testTitle() throws Exception {
+    String text = "<html><head><TITLE>foo</TITLE></head><body>bar</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertEquals("foo", parser.getTitle());
+  }
+
+  /**
+   * A body much longer than 200 characters (28 repetitions of a 14-character
+   * sentence) must produce a summary of exactly 200 characters.
+   */
+  public void testSummary() throws Exception {
+    String text = "<html><head><TITLE>foo</TITLE></head><body>" +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "Summarize me. Summarize me. Summarize me. Summarize me. " +
+        "</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertEquals(200, parser.getSummary().length());
+  }
+
+  /**
+   * LUCENE-590: a body that starts with the same word as the title must still
+   * be returned in full as the summary.
+   */
+  public void testSummaryTitle() throws Exception {
+    String text = "<html><head><title>Summary</title></head><body>Summary of the document</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertEquals("Summary of the document", parser.getSummary());
+  }
+
+  /**
+   * LUCENE-2246: Turkish characters inside attribute values. The image's ALT
+   * text is expected in the output wrapped in square brackets, while the
+   * anchor's title attribute is expected to contribute nothing.
+   */
+  public void testTurkish() throws Exception {
+    String text = "<html><body>" +
+        "<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
+        "<a title=\"(ııı)\"></body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("[ş]", parser);
+  }
+
+  /**
+   * Drains the parser's reader and asserts that the text read equals
+   * {@code expected}.
+   *
+   * @param expected the exact text the reader must produce
+   * @param parser the parser whose {@code getReader()} output is checked
+   * @throws IOException if reading from or closing the reader fails
+   */
+  private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
+    Reader reader = parser.getReader();
+    StringBuilder builder = new StringBuilder();
+    try {
+      int ch;
+      while ((ch = reader.read()) != -1) {
+        builder.append((char) ch);
+      }
+    } finally {
+      // Release the reader even when a read fails part-way through.
+      reader.close();
+    }
+    assertEquals(expected, builder.toString());
+  }
+}