1 package org.apache.lucene.benchmark.byTask.feeds.demohtml;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.io.StringReader;
23 import java.util.Properties;
25 import org.apache.lucene.util.LuceneTestCase;
27 public class TestHtmlParser extends LuceneTestCase {
29 public void testUnicode() throws Exception {
30 String text = "<html><body>汉语</body></html>";
31 HTMLParser parser = new HTMLParser(new StringReader(text));
32 assertReadsTo("汉语", parser);
35 public void testEntities() throws Exception {
36 String text = "<html><body>汉语¥</body></html>";
37 HTMLParser parser = new HTMLParser(new StringReader(text));
38 assertReadsTo("汉语¥", parser);
41 public void testComments() throws Exception {
42 String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
43 HTMLParser parser = new HTMLParser(new StringReader(text));
44 assertReadsTo("foo", parser);
47 public void testScript() throws Exception {
48 String text = "<html><body><script type=\"text/javascript\">" +
49 "document.write(\"test\")</script>foo</body></html>";
50 HTMLParser parser = new HTMLParser(new StringReader(text));
51 assertReadsTo("foo", parser);
54 public void testStyle() throws Exception {
55 String text = "<html><head><style type=\"text/css\">" +
56 "body{background-color:blue;}</style>" +
57 "</head><body>foo</body></html>";
58 HTMLParser parser = new HTMLParser(new StringReader(text));
59 assertReadsTo("foo", parser);
62 public void testDoctype() throws Exception {
63 String text = "<!DOCTYPE HTML PUBLIC " +
64 "\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
65 "\"http://www.w3.org/TR/html4/loose.dtd\">" +
66 "<html><body>foo</body></html>";
67 HTMLParser parser = new HTMLParser(new StringReader(text));
68 assertReadsTo("foo", parser);
71 public void testMeta() throws Exception {
72 String text = "<html><head>" +
73 "<meta name=\"a\" content=\"1\" />" +
74 "<meta name=\"b\" content=\"2\" />" +
75 "<meta name=\"keywords\" content=\"this is a test\" />" +
76 "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
77 "</head><body>foobar</body></html>";
78 HTMLParser parser = new HTMLParser(new StringReader(text));
79 Properties tags = parser.getMetaTags();
80 assertEquals(4, tags.size());
81 assertEquals("1", tags.get("a"));
82 assertEquals("2", tags.get("b"));
83 assertEquals("this is a test", tags.get("keywords"));
84 assertEquals("text/html;charset=utf-8", tags.get("content-type"));
87 public void testTitle() throws Exception {
88 String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
89 HTMLParser parser = new HTMLParser(new StringReader(text));
90 assertEquals("foo", parser.getTitle());
93 public void testSummary() throws Exception {
94 String text = "<html><head><TITLE>foo</TITLE><head><body>" +
95 "Summarize me. Summarize me. Summarize me. Summarize me. " +
96 "Summarize me. Summarize me. Summarize me. Summarize me. " +
97 "Summarize me. Summarize me. Summarize me. Summarize me. " +
98 "Summarize me. Summarize me. Summarize me. Summarize me. " +
99 "Summarize me. Summarize me. Summarize me. Summarize me. " +
100 "Summarize me. Summarize me. Summarize me. Summarize me. " +
101 "Summarize me. Summarize me. Summarize me. Summarize me. " +
103 HTMLParser parser = new HTMLParser(new StringReader(text));
104 assertEquals(200, parser.getSummary().length());
108 public void testSummaryTitle() throws Exception {
109 String text = "<html><head><title>Summary</title></head><body>Summary of the document</body></html>";
110 HTMLParser parser = new HTMLParser(new StringReader(text));
111 assertEquals("Summary of the document", parser.getSummary());
115 public void testTurkish() throws Exception {
116 String text = "<html><body>" +
117 "<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
118 "<a title=\"(ııı)\"></body></html>";
119 HTMLParser parser = new HTMLParser(new StringReader(text));
120 assertReadsTo("[ş]", parser);
123 private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
124 Reader reader = parser.getReader();
125 StringBuilder builder = new StringBuilder();
127 while ((ch = reader.read()) != -1) {
128 builder.append((char)ch);
130 assertEquals(expected, builder.toString());