2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
22 //DEBUG_LOOKAHEAD = true;
23 //DEBUG_TOKEN_MANAGER = true;
27 PARSER_BEGIN(HTMLParser)
29 package org.apache.lucene.benchmark.byTask.feeds.demohtml;
32 import java.util.Locale;
33 import java.util.Properties;
// Parser state for one HTML document. The fields below are filled in while
// the grammar productions run and are read back through getTitle(),
// getSummary(), getMetaTags() and getReader().
35 public class HTMLParser {
// Target summary size in characters: addToSummary() stops appending once
// this is reached and getSummary() truncates to it. Not final, so it can be
// reassigned by callers.
36 public static int SUMMARY_LENGTH = 200;
// Text collected while inside <title>, and leading body text for the summary.
38 StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
39 StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
// Collected meta tags; currentMetaTag/currentMetaContent hold the name and
// content of the tag being parsed until both are known.
40 Properties metaTags=new Properties();
41 String currentMetaTag=null;
42 String currentMetaContent=null;
// Set once the title has been fully read (see addText()).
44 boolean titleComplete = false;
// Recomputed by Tag() from the name of each tag encountered.
45 boolean inTitle = false;
46 boolean inMetaTag = false;
47 boolean inStyle = false;
// afterTag is maintained by HTMLDocument() and decides in addSpace() whether
// a space is emitted as a line separator or as a single blank.
48 boolean afterTag = false;
49 boolean afterSpace = false;
// Platform line separator, written by addSpace() when a space follows a tag.
50 String eol = System.getProperty("line.separator");
// Byte-level ends of the pipe created in getReader().
53 private MyPipedInputStream pipeInStream = null;
54 private PipedOutputStream pipeOutStream = null;
// PipedInputStream subclass that can report when its buffer has filled up.
// The accessor methods of the enclosing class test full() alongside their
// own completion condition.
56 private class MyPipedInputStream extends PipedInputStream{
58 public MyPipedInputStream(){
62 public MyPipedInputStream(PipedOutputStream src) throws IOException{
// True when the number of bytes available to read has reached
// PipedInputStream.PIPE_SIZE.
66 public boolean full() throws IOException{
67 return this.available() >= PipedInputStream.PIPE_SIZE;
// Returns the collected title text with leading and trailing whitespace
// removed. Calls getReader() first, which starts the parser thread on first
// use, and then tests for a completed title or a full pipe.
// NOTE(review): the statements surrounding that test are not visible in this
// listing; presumably it ends a wait loop — confirm against the full file.
71 public String getTitle() throws IOException, InterruptedException {
73 getReader(); // spawn parsing thread
76 if (titleComplete || pipeInStream.full())
81 return title.toString().trim();
// Accessor for the collected meta tags. Calls getReader() first, which starts
// the parser thread on first use, and tests the same condition as getTitle():
// title complete or pipe full.
// NOTE(review): the return statement is not visible in this listing.
84 public Properties getMetaTags() throws IOException,
85 InterruptedException {
87 getReader(); // spawn parsing thread
90 if (titleComplete || pipeInStream.full())
// Builds the document summary. Calls getReader() first, which starts the
// parser thread on first use, and tests whether the summary has reached
// SUMMARY_LENGTH or the pipe is full.
// NOTE(review): the statements that combine the summary with the title and
// return the result are not visible in this listing.
99 public String getSummary() throws IOException, InterruptedException {
101 getReader(); // spawn parsing thread
104 if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
// Cut the buffer back to exactly SUMMARY_LENGTH characters if it overshot.
109 if (summary.length() > SUMMARY_LENGTH)
110 summary.setLength(SUMMARY_LENGTH);
// Trimmed summary and title, fetched for the logic that follows.
112 String sum = summary.toString().trim();
113 String tit = getTitle();
// Sets up the pipe and starts parsing the first time it is called (while
// pipeIn is still null). Text written by the parser to pipeOut is encoded as
// UTF-16BE and decoded with the same charset on the pipeIn side.
// NOTE(review): the declarations of pipeIn/pipeOut and the return statement
// are not visible in this listing.
120 public Reader getReader() throws IOException {
121 if (pipeIn == null) {
122 pipeInStream = new MyPipedInputStream();
123 pipeOutStream = new PipedOutputStream(pipeInStream);
124 pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
125 pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
// The parser runs on its own thread, constructed with this parser instance.
127 Thread thread = new ParserThread(this);
128 thread.start(); // start parsing
// Appends text to the summary buffer, but only while the buffer is still
// shorter than SUMMARY_LENGTH. A single append may overshoot the limit;
// getSummary() truncates afterwards.
134 void addToSummary(String text) {
135 if (summary.length() < SUMMARY_LENGTH) {
136 summary.append(text);
// Re-check after appending: the limit has just been reached.
// NOTE(review): the body of this branch is not visible in this listing.
137 if (summary.length() >= SUMMARY_LENGTH) {
// Handles a piece of document text produced by the grammar.
// NOTE(review): several statements of this method are not visible in this
// listing; only the title-completion check and the length update are shown.
145 void addText(String text) throws IOException {
// The title counts as finished once some title text has been collected and
// completion has not been flagged yet.
152 if (!titleComplete && !(title.length() == 0)) { // finished title
154 titleComplete = true; // tell waiting threads
// Running count of characters emitted so far.
160 length += text.length();
// Stores the current meta tag name/content pair in metaTags, then clears
// both fields so the next meta tag starts from scratch.
// NOTE(review): the embedded line numbers jump from 160 to 167 here, so these
// statements presumably belong to a separate method whose header is not
// shown — confirm against the full file.
167 metaTags.setProperty(currentMetaTag, currentMetaContent);
168 currentMetaTag = null;
169 currentMetaContent = null;
// Emits whitespace to the output pipe.
// NOTE(review): statements between the header and the lines below are not
// visible in this listing.
173 void addSpace() throws IOException {
// Whitespace directly after a tag becomes a line separator; anywhere else it
// becomes a single blank.
180 String space = afterTag ? eol : " ";
181 length += space.length();
182 pipeOut.write(space);
187 // void handleException(Exception e) {
188 // System.out.println(e.toString()); // print the error message
189 // System.out.println("Skipping...");
192 // t = getNextToken();
193 // } while (t.kind != TagEnd);
197 PARSER_END(HTMLParser)
// Top-level production: a document is a sequence of markup constructs and
// text tokens. Markup alternatives set afterTag; text alternatives forward
// their text to addText()/addSpace() and clear afterTag.
200 void HTMLDocument() throws IOException :
// Markup: tags, declarations, comments and scripts contribute no text here.
206 ( Tag() { afterTag = true; }
207 | t=Decl() { afterTag = true; }
208 | CommentTag() { afterTag = true; }
209 | ScriptTag() { afterTag = true; }
// Text: words and punctuation are passed through verbatim, entities are
// decoded first, and whitespace goes through addSpace().
210 | t=<Word> { addText(t.image); afterTag = false; }
211 | t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
212 | t=<Punct> { addText(t.image); afterTag = false; }
213 | <Space> { addSpace(); afterTag = false; }
215 // } catch (ParseException e) {
216 // handleException(e);
// Production for one tag: records which kind of tag was seen and harvests
// the attributes of interest (IMG alt text and META name/http-equiv/content).
// NOTE(review): the token assignments for t1/t2 and several braces are not
// visible in this listing; t1 presumably holds the tag-name token first and
// attribute-name tokens afterwards — confirm against the full file.
220 void Tag() throws IOException :
223 boolean inImg = false;
// The tag-name image is lower-cased with a fixed locale before comparison.
227 String tagName = t1.image.toLowerCase(Locale.ENGLISH);
228 if(Tags.WS_ELEMS.contains(tagName) ) {
// The compared literals include the leading "<" and no "/", so these flags
// are true only for the matching opening tag and false for any other tag.
231 inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
232 inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
233 inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
234 inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
238 (t2=ArgValue() // save ALT text in IMG tag
// IMG alt text is emitted as document text wrapped in square brackets.
240 if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
241 addText("[" + t2.image + "]");
// A "name" or "http-equiv" attribute supplies the meta tag key (lower-cased).
244 ( t1.image.equalsIgnoreCase("name") ||
245 t1.image.equalsIgnoreCase("HTTP-EQUIV")
249 currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
// Key and content may arrive in either order; act once both are known.
250 if(currentMetaTag != null && currentMetaContent != null) {
// A "content" attribute on a META tag supplies the value.
// NOTE(review): the value is lower-cased as well, so case-sensitive meta
// content is not preserved — confirm this is intended.
254 if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
257 currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
258 if(currentMetaTag != null && currentMetaContent != null) {
// Alternatives that return an attribute value token: an unquoted value, or
// the text between single or double quotes. For an empty quoted value the
// current value of t is returned unchanged.
// NOTE(review): the production header and the declaration of t are not
// visible in this listing; presumably this is the body of ArgValue() as
// called from Tag().
274 t=<ArgValue> { return t; }
276 <ArgQuote1> <CloseQuote1> { return t; }
277 | <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
279 <ArgQuote2> <CloseQuote2> { return t; }
280 | <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
// A declaration: its name token, then any mix of attribute names, attribute
// values and "=" tokens, up to the closing tag end.
// NOTE(review): the production header is not visible in this listing;
// presumably this is the body of Decl() as called from HTMLDocument().
289 t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
// Two comment forms, each an opener, any number of comment-text tokens, and
// the matching terminator.
// NOTE(review): the production header is not visible in this listing;
// presumably this is the body of CommentTag() as called from HTMLDocument().
297 (<Comment1> ( <CommentText1> )* <CommentEnd1>)
299 (<Comment2> ( <CommentText2> )* <CommentEnd2>)
// A script element: the opener, any number of script-text tokens, and the
// closing tag. None of the script content is passed to addText().
// NOTE(review): the production header is not visible in this listing;
// presumably this is the body of ScriptTag() as called from HTMLDocument().
305 <ScriptStart> ( <ScriptText> )* <ScriptEnd>
// Tokens recognised outside of any tag. Names prefixed with '#' are private
// helper expressions used only inside other token definitions.
// NOTE(review): the TOKEN header and the Space token definition are not
// visible in this listing.
// Markup openers; each switches the lexer into the state for its construct.
311 < ScriptStart: "<script" > : WithinScript
// "<", an optional "/", a letter, then optionally the rest of the name.
312 | < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
313 | < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
315 | < Comment1: "<!--" > : WithinComment1
316 | < Comment2: "<!" > : WithinComment2
// A word is one or more of: an alphanumeric, an alphanumeric followed by
// '+' or '/' characters, a digit followed by a double quote, two
// alphanumerics joined by '-' or an apostrophe, or a number with ',' or '.'
// inside and an optional leading '$'.
318 | < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
319 <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
320 | < #LET: ["A"-"Z","a"-"z","0"-"9"] >
321 | < #NUM: ["0"-"9"] >
322 | < #HEX: ["0"-"9","A"-"F","a"-"f"] >
// Named, decimal and hexadecimal character references; the trailing ';' is
// optional in all three forms.
324 | < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
327 | < #SP: [" ","\t","\r","\n"] >
329 | < Punct: ~[] > // Keep this last. It is a catch-all.
// Lexer state entered by ScriptStart. Script content is split into runs
// without angle brackets and single "<" or ">" characters, until the closing
// tag returns the lexer to DEFAULT.
332 <WithinScript> TOKEN:
334 < ScriptText: (~["<",">"])+ | "<" | ">" >
// "</script", any characters other than angle brackets, then ">".
335 | < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
// Tokens for the inside of a tag.
// NOTE(review): the lexer-state header for these tokens is not visible in
// this listing; presumably it is WithinTag.
// An attribute name: the first character may not be whitespace, '=', '>' or
// a quote; later characters may not be whitespace, '=' or '>'.
340 < ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
341 (~[" ","\t","\r","\n","=",">"])* >
// "=" moves to the state that expects an attribute value.
342 | < ArgEquals: "=" > : AfterEquals
// ">" or "=>" closes the tag and returns to DEFAULT.
343 | < TagEnd: ">" | "=>" > : DEFAULT
// An unquoted attribute value: the first character may not be whitespace,
// '=', '>' or a quote; later characters may not be whitespace or '>'. The
// lexer then returns to WithinTag.
// NOTE(review): the lexer-state header is not visible in this listing;
// presumably it is AfterEquals.
348 < ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
349 (~[" ","\t","\r","\n",">"])* > : WithinTag
// Opening quotes are recognised in both WithinTag and AfterEquals; each
// switches to the state that reads the matching quoted text.
352 <WithinTag, AfterEquals> TOKEN:
354 < ArgQuote1: "'" > : WithinQuote1
355 | < ArgQuote2: "\"" > : WithinQuote2
358 <WithinTag, AfterEquals> SKIP:
// Inside a single-quoted value: everything up to the next "'" is one text
// token, and the closing quote returns the lexer to WithinTag.
363 <WithinQuote1> TOKEN:
365 < Quote1Text: (~["'"])+ >
366 | < CloseQuote1: <ArgQuote1> > : WithinTag
// Inside a double-quoted value: everything up to the next '"' is one text
// token, and the closing quote returns the lexer to WithinTag.
369 <WithinQuote2> TOKEN:
371 < Quote2Text: (~["\""])+ >
372 | < CloseQuote2: <ArgQuote2> > : WithinTag
// Inside a "<!--" comment: text is consumed as runs without '-' or as single
// '-' characters, until "-->" returns the lexer to DEFAULT.
376 <WithinComment1> TOKEN :
378 < CommentText1: (~["-"])+ | "-" >
379 | < CommentEnd1: "-->" > : DEFAULT
// Inside a "<!" comment: everything up to the next '>' is comment text, and
// the '>' itself returns the lexer to DEFAULT.
382 <WithinComment2> TOKEN :
384 < CommentText2: (~[">"])+ >
385 | < CommentEnd2: ">" > : DEFAULT