--- /dev/null
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// HTMLParser.jj
+
+options {
+ // Generate an instance-based (non-static) parser: the HTMLParser class
+ // below keeps its per-document state in instance fields.
+ STATIC = false;
+ // Debugging switches, left disabled.
+ //DEBUG_LOOKAHEAD = true;
+ //DEBUG_TOKEN_MANAGER = true;
+ // Ask JavaCC for a character stream that accepts Unicode input.
+ UNICODE_INPUT = true;
+}
+
+PARSER_BEGIN(HTMLParser)
+
+package org.apache.lucene.benchmark.byTask.feeds.demohtml;
+
+import java.io.*;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Hand-written members of the JavaCC-generated HTML parser.
+ *
+ * <p>The grammar actions call {@link #addText}, {@link #addSpace} and
+ * {@link #addMetaTag}, which collect the title, a short summary and the META
+ * tags, and copy the plain text into a pipe. Callers read that text from
+ * {@link #getReader()}. The first call to {@link #getReader()} (directly or
+ * through one of the other getters) starts a {@code ParserThread}, declared
+ * elsewhere, on this parser.
+ */
+public class HTMLParser {
+  /** Number of summary characters to collect before {@link #getSummary()} stops waiting. */
+  public static int SUMMARY_LENGTH = 200;
+
+  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
+  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
+  Properties metaTags=new Properties();
+  // Name and content of the META tag currently being assembled.
+  String currentMetaTag=null;
+  String currentMetaContent=null;
+  // Running count of characters written to the pipe.
+  int length = 0;
+  boolean titleComplete = false;
+  boolean inTitle = false;
+  boolean inMetaTag = false;
+  boolean inStyle = false;
+  // afterTag: the previous item was markup; afterSpace: a separator was just emitted.
+  boolean afterTag = false;
+  boolean afterSpace = false;
+  String eol = System.getProperty("line.separator");
+  Reader pipeIn = null;
+  Writer pipeOut;
+  private MyPipedInputStream pipeInStream = null;
+  private PipedOutputStream pipeOutStream = null;
+
+  /**
+   * Piped input stream that can report whether its buffer is full.
+   * Static because it never touches the enclosing parser instance.
+   */
+  private static class MyPipedInputStream extends PipedInputStream{
+
+    public MyPipedInputStream(){
+      super();
+    }
+
+    public MyPipedInputStream(PipedOutputStream src) throws IOException{
+      super(src);
+    }
+
+    /** Returns true once at least PIPE_SIZE bytes are waiting to be read. */
+    public boolean full() throws IOException{
+      return this.available() >= PipedInputStream.PIPE_SIZE;
+    }
+  }
+
+  /**
+   * Starts parsing if necessary, then blocks until the title is complete or
+   * the pipe is full (in which case nobody is draining the reader and waiting
+   * longer would not help). Polls every 10 ms in addition to being notified.
+   */
+  private void awaitTitle() throws IOException, InterruptedException {
+    if (pipeIn == null) {
+      getReader();				  // spawn parsing thread
+    }
+    synchronized(this) {
+      while (!titleComplete && !pipeInStream.full()) {
+        wait(10);
+      }
+    }
+  }
+
+  /**
+   * Returns the trimmed text collected while inside the TITLE element.
+   *
+   * @throws IOException if the pipe cannot be set up or queried
+   * @throws InterruptedException if interrupted while waiting for the parser
+   */
+  public String getTitle() throws IOException, InterruptedException {
+    awaitTitle();
+    return title.toString().trim();
+  }
+
+  /**
+   * Returns the META tags collected so far, after waiting under the same
+   * condition as {@link #getTitle()}. The live Properties object is returned.
+   *
+   * @throws IOException if the pipe cannot be set up or queried
+   * @throws InterruptedException if interrupted while waiting for the parser
+   */
+  public Properties getMetaTags() throws IOException, InterruptedException {
+    awaitTitle();
+    return metaTags;
+  }
+
+
+  /**
+   * Returns at most SUMMARY_LENGTH characters of non-title text, trimmed,
+   * or the title when that text is empty. The title is always fetched, so
+   * this call also waits under the same condition as {@link #getTitle()}.
+   *
+   * @throws IOException if the pipe cannot be set up or queried
+   * @throws InterruptedException if interrupted while waiting for the parser
+   */
+  public String getSummary() throws IOException, InterruptedException {
+    if (pipeIn == null) {
+      getReader();				  // spawn parsing thread
+    }
+    synchronized(this) {
+      while (summary.length() < SUMMARY_LENGTH && !pipeInStream.full()) {
+        wait(10);
+      }
+    }
+    if (summary.length() > SUMMARY_LENGTH) {
+      summary.setLength(SUMMARY_LENGTH);
+    }
+
+    String sum = summary.toString().trim();
+    String tit = getTitle();
+    return sum.equals("") ? tit : sum;
+  }
+
+  /**
+   * Returns the reader delivering the document's plain text. On the first
+   * call the pipe is created (UTF-16BE on both ends) and a ParserThread is
+   * started on this parser; later calls return the same reader.
+   *
+   * @throws IOException if the pipe or its reader/writer cannot be created
+   */
+  public Reader getReader() throws IOException {
+    if (pipeIn == null) {
+      pipeInStream = new MyPipedInputStream();
+      pipeOutStream = new PipedOutputStream(pipeInStream);
+      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
+      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
+
+      Thread thread = new ParserThread(this);
+      thread.start();				  // start parsing
+    }
+
+    return pipeIn;
+  }
+
+  /** Appends text to the summary until it is long enough, then wakes waiting threads. */
+  void addToSummary(String text) {
+    if (summary.length() >= SUMMARY_LENGTH) {
+      return;
+    }
+    summary.append(text);
+    if (summary.length() >= SUMMARY_LENGTH) {
+      synchronized(this) {
+        notifyAll();
+      }
+    }
+  }
+
+  /**
+   * Records a piece of document text: into the title while inside TITLE,
+   * otherwise into the summary, and always into the pipe. Text inside a
+   * STYLE element is dropped entirely.
+   */
+  void addText(String text) throws IOException {
+    if (inStyle) {
+      return;
+    }
+    if (inTitle) {
+      title.append(text);
+    } else {
+      addToSummary(text);
+      // First non-title text after a non-empty title means the title is finished.
+      if (!titleComplete && title.length() != 0) {
+        synchronized(this) {
+          titleComplete = true;			  // tell waiting threads
+          notifyAll();
+        }
+      }
+    }
+
+    length += text.length();
+    pipeOut.write(text);
+
+    afterSpace = false;
+  }
+
+  /** Stores the pending META name/content pair and clears both pending fields. */
+  void addMetaTag() {
+    metaTags.setProperty(currentMetaTag, currentMetaContent);
+    currentMetaTag = null;
+    currentMetaContent = null;
+  }
+
+  /**
+   * Emits one separator unless one was just emitted. The title or summary
+   * receives a blank; the pipe receives a line separator when the previous
+   * item was a tag and a blank otherwise.
+   */
+  void addSpace() throws IOException {
+    if (afterSpace) {
+      return;
+    }
+    if (inTitle) {
+      title.append(" ");
+    } else {
+      addToSummary(" ");
+    }
+
+    String space = afterTag ? eol : " ";
+    length += space.length();
+    pipeOut.write(space);
+    afterSpace = true;
+  }
+
+  // Error recovery is currently disabled; kept for reference.
+//  void handleException(Exception e) {
+//    System.out.println(e.toString());  // print the error message
+//    System.out.println("Skipping...");
+//    Token t;
+//    do {
+//      t = getNextToken();
+//    } while (t.kind != TagEnd);
+//  }
+}
+
+PARSER_END(HTMLParser)
+
+
+// Top-level production: consumes tags, declarations, comments, scripts and
+// text tokens until end of input.
+// After each item, afterTag records whether it was markup (true) or text /
+// whitespace (false); addSpace() uses that flag to choose between a line
+// separator and a blank.
+// Entity images are passed through Entities.decode (declared elsewhere)
+// before being added as text.
+void HTMLDocument() throws IOException :
+{
+ Token t;
+}
+{
+// try {
+ ( Tag() { afterTag = true; }
+ | t=Decl() { afterTag = true; }
+ | CommentTag() { afterTag = true; }
+ | ScriptTag() { afterTag = true; }
+ | t=<Word> { addText(t.image); afterTag = false; }
+ | t=<Entity> { addText(Entities.decode(t.image)); afterTag = false; }
+ | t=<Punct> { addText(t.image); afterTag = false; }
+ | <Space> { addSpace(); afterTag = false; }
+ )* <EOF>
+// } catch (ParseException e) {
+// handleException(e);
+// }
+}
+
+// Matches one opening or closing tag with its attributes.
+// On the tag name it updates the inTitle / inMetaTag / inStyle flags and
+// emits a separator for element names listed in Tags.WS_ELEMS (declared
+// elsewhere). For each attribute that has a value it:
+//  - adds "[alt text]" to the document text for IMG tags, and
+//  - pairs a META tag's name / http-equiv attribute with its content
+//    attribute (in either order) and stores the pair through addMetaTag().
+void Tag() throws IOException :
+{
+  Token t1, t2;
+  boolean inImg = false;
+}
+{
+  t1=<TagName> {
+    String tagName = t1.image.toLowerCase(Locale.ENGLISH);
+    if (Tags.WS_ELEMS.contains(tagName)) {
+      addSpace();
+    }
+    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
+    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
+    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
+    inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
+
+    // A name/content pair must come from a single tag. Drop any half-built
+    // pair left by an earlier tag (e.g. a META with a name but no content)
+    // so it cannot be combined with an attribute of this tag.
+    currentMetaTag = null;
+    currentMetaContent = null;
+  }
+  (t1=<ArgName>
+    (<ArgEquals>
+      (t2=ArgValue() // save ALT text in IMG tag
+        {
+          // t2 is null when the value is an empty quoted string.
+          if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) {
+            addText("[" + t2.image + "]");
+          }
+
+          if (inMetaTag && t2 != null) {
+            if (t1.image.equalsIgnoreCase("name") ||
+                t1.image.equalsIgnoreCase("HTTP-EQUIV")) {
+              currentMetaTag = t2.image.toLowerCase(Locale.ENGLISH);
+              if (currentMetaTag != null && currentMetaContent != null) {
+                addMetaTag();
+              }
+            }
+            if (t1.image.equalsIgnoreCase("content")) {
+              currentMetaContent = t2.image.toLowerCase(Locale.ENGLISH);
+              if (currentMetaTag != null && currentMetaContent != null) {
+                addMetaTag();
+              }
+            }
+          }
+        }
+      )?
+    )?
+  )*
+  <TagEnd>
+}
+
+// Matches one attribute value: unquoted, single-quoted or double-quoted.
+// Returns the token holding the value text (without the quotes), or null
+// when the value is an empty quoted string ('' or "").
+// The empty-quote and quoted-text alternatives start with the same quote
+// token, hence the two-token lookahead to tell them apart.
+Token ArgValue() :
+{
+ Token t = null;
+}
+{
+ t=<ArgValue> { return t; }
+| LOOKAHEAD(2)
+ <ArgQuote1> <CloseQuote1> { return t; }
+| <ArgQuote1> t=<Quote1Text> <CloseQuote1> { return t; }
+| LOOKAHEAD(2)
+ <ArgQuote2> <CloseQuote2> { return t; }
+| <ArgQuote2> t=<Quote2Text> <CloseQuote2> { return t; }
+}
+
+
+// Matches a declaration of the form "<!name ... >" (a DeclName token followed
+// by any mix of argument names, values and equals signs up to the tag end).
+// Returns the DeclName token; the arguments are consumed and discarded.
+Token Decl() :
+{
+ Token t;
+}
+{
+ t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
+ { return t; }
+}
+
+
+// Matches and discards a comment in either of two forms:
+//   "<!--" ... "-->"   (Comment1)
+//   "<!"   ... ">"     (Comment2)
+void CommentTag() :
+{}
+{
+ (<Comment1> ( <CommentText1> )* <CommentEnd1>)
+ |
+ (<Comment2> ( <CommentText2> )* <CommentEnd2>)
+}
+
+// Matches and discards everything from "<script" through the closing
+// "</script ... >" so that script source never reaches the document text.
+void ScriptTag() :
+{}
+{
+ <ScriptStart> ( <ScriptText> )* <ScriptEnd>
+}
+
+
+// Tokens recognized in the DEFAULT lexical state (ordinary document text).
+TOKEN :
+{
+ // Markup openers; each one switches to the state that lexes its interior.
+ // NOTE(review): "<script" is a lowercase literal and this file sets no
+ // IGNORE_CASE option, so "<SCRIPT" presumably lexes as TagName instead -
+ // confirm whether that is intended.
+ < ScriptStart: "<script" > : WithinScript
+| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
+| < DeclName: "<" "!" ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
+
+| < Comment1: "<!--" > : WithinComment1
+| < Comment2: "<!" > : WithinComment2
+
+ // A word is one or more of: a letter/digit; a letter/digit followed by
+ // '+' or '/' characters; a digit followed by a double quote; two
+ // letters/digits joined by '-' or an apostrophe; or an optionally
+ // '$'-prefixed pair of digits joined by ',' or '.'.
+| < Word: ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
+ <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
+ // Private helper classes. Note that LET includes the digits.
+| < #LET: ["A"-"Z","a"-"z","0"-"9"] >
+| < #NUM: ["0"-"9"] >
+| < #HEX: ["0"-"9","A"-"F","a"-"f"] >
+
+ // Named, decimal and hexadecimal character references; the trailing ';' is optional.
+| < Entity: ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
+
+| < Space: (<SP>)+ >
+| < #SP: [" ","\t","\r","\n"] >
+
+| < Punct: ~[] > // Keep this last. It is a catch-all.
+}
+
+// Inside a script element: text runs without angle brackets, or a single
+// '<' or '>', are script text; "</script" followed by any non-bracket
+// characters and '>' ends the script and returns to DEFAULT.
+<WithinScript> TOKEN:
+{
+ < ScriptText: (~["<",">"])+ | "<" | ">" >
+| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
+}
+
+// Inside a tag, after its name: attribute names, '=' (which moves to
+// AfterEquals to read the value) and the tag end (">" or "=>"), which
+// returns to DEFAULT.
+<WithinTag> TOKEN:
+{
+ // First character may not be whitespace, '=', '>' or a quote; later
+ // characters may be quotes.
+ < ArgName: (~[" ","\t","\r","\n","=",">","'","\""])
+ (~[" ","\t","\r","\n","=",">"])* >
+| < ArgEquals: "=" > : AfterEquals
+| < TagEnd: ">" | "=>" > : DEFAULT
+}
+
+// Directly after '=': an unquoted value. Unlike ArgName, characters after
+// the first may include '='. Matching a value returns to WithinTag.
+<AfterEquals> TOKEN:
+{
+ < ArgValue: (~[" ","\t","\r","\n","=",">","'","\""])
+ (~[" ","\t","\r","\n",">"])* > : WithinTag
+}
+
+// An opening quote, accepted both after '=' and elsewhere inside a tag,
+// switches to the matching quoted-text state.
+<WithinTag, AfterEquals> TOKEN:
+{
+ < ArgQuote1: "'" > : WithinQuote1
+| < ArgQuote2: "\"" > : WithinQuote2
+}
+
+// Whitespace between attributes, and around '=', is skipped.
+<WithinTag, AfterEquals> SKIP:
+{
+ < <Space> >
+}
+
+// Inside a single-quoted value: everything up to the next apostrophe is
+// the value text; the closing apostrophe returns to WithinTag.
+<WithinQuote1> TOKEN:
+{
+ < Quote1Text: (~["'"])+ >
+| < CloseQuote1: <ArgQuote1> > : WithinTag
+}
+
+// Inside a double-quoted value: everything up to the next double quote is
+// the value text; the closing double quote returns to WithinTag.
+<WithinQuote2> TOKEN:
+{
+ < Quote2Text: (~["\""])+ >
+| < CloseQuote2: <ArgQuote2> > : WithinTag
+}
+
+
+// Inside "<!--": runs of non-hyphen characters, or a single hyphen, are
+// comment text; "-->" ends the comment and returns to DEFAULT.
+<WithinComment1> TOKEN :
+{
+ < CommentText1: (~["-"])+ | "-" >
+| < CommentEnd1: "-->" > : DEFAULT
+}
+
+// Inside "<!": everything up to the next '>' is comment text; '>' ends
+// the comment and returns to DEFAULT.
+<WithinComment2> TOKEN :
+{
+ < CommentText2: (~[">"])+ >
+| < CommentEnd2: ">" > : DEFAULT
+}