+++ /dev/null
-/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-import java.io.*;
-import java.util.Locale;
-import java.util.Properties;
-
-public class HTMLParser implements HTMLParserConstants {
- // Character threshold for the summary: addToSummary() stops appending once it is reached
- // and getSummary() truncates the buffer to this length.
- public static int SUMMARY_LENGTH = 200;
-
- // Text seen while inTitle is set (see addText/addSpace).
- StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
- // Text seen outside the title, collected through addToSummary().
- StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
- // name/http-equiv -> content pairs stored by addMetaTag().
- Properties metaTags=new Properties();
- // Pending halves of a meta pair; both are cleared by addMetaTag().
- String currentMetaTag=null;
- String currentMetaContent=null;
- // Running count of characters written to pipeOut by addText()/addSpace().
- int length = 0;
- // Set (under the parser's monitor) by addText() once body text follows a non-empty title.
- boolean titleComplete = false;
- // Flags recomputed by Tag() from the most recently parsed tag name.
- boolean inTitle = false;
- boolean inMetaTag = false;
- boolean inStyle = false;
- // True after HTMLDocument() handled a tag/declaration/comment/script, false after text or space.
- boolean afterTag = false;
- // True after addSpace() emitted a separator; cleared by addText(). Collapses runs of spaces.
- boolean afterSpace = false;
- // Written by addSpace() instead of a single blank when the space directly follows a tag.
- String eol = System.getProperty("line.separator");
- // Reader/writer ends of the pipe, created lazily by getReader().
- Reader pipeIn = null;
- Writer pipeOut;
- // Underlying piped streams, also created by getReader().
- private MyPipedInputStream pipeInStream = null;
- private PipedOutputStream pipeOutStream = null;
-
- /**
-  * Piped input stream that can report whether its buffer has filled up.
-  *
-  * <p>Declared {@code static} because it never touches the enclosing parser; a non-static
-  * inner class would carry a hidden reference to the {@code HTMLParser} instance for no benefit.
-  */
- private static class MyPipedInputStream extends PipedInputStream {
-
-   /** Creates an unconnected stream. */
-   public MyPipedInputStream() {
-     super();
-   }
-
-   /**
-    * Creates a stream connected to {@code src}.
-    *
-    * @throws IOException if the connection cannot be established
-    */
-   public MyPipedInputStream(PipedOutputStream src) throws IOException {
-     super(src);
-   }
-
-   /**
-    * Tells whether at least {@code PIPE_SIZE} bytes are waiting to be read.
-    *
-    * @return {@code true} when {@code available() >= PIPE_SIZE}
-    * @throws IOException if {@code available()} fails
-    */
-   public boolean full() throws IOException {
-     return this.available() >= PipedInputStream.PIPE_SIZE;
-   }
- }
-
- /**
-  * Returns the trimmed title text collected so far.
-  *
-  * <p>Starts the parsing thread through {@link #getReader()} if it is not running yet, then
-  * polls in 10 ms steps until either the title is flagged complete or the pipe is full.
-  *
-  * @throws IOException if the pipe cannot be created or queried
-  * @throws InterruptedException if the calling thread is interrupted while waiting
-  */
- public String getTitle() throws IOException, InterruptedException {
-   if (pipeIn == null) {
-     getReader(); // spawn parsing thread
-   }
-   synchronized (this) {
-     // wait() gives up the monitor while sleeping, so the parser thread can still signal us.
-     while (!titleComplete && !pipeInStream.full()) {
-       wait(10);
-     }
-   }
-   return title.toString().trim();
- }
-
- /**
-  * Returns the meta tag properties gathered by the parser.
-  *
-  * <p>Starts the parsing thread through {@link #getReader()} if needed and then waits, in
-  * 10 ms steps, until the title is flagged complete or the pipe is full.
-  *
-  * @throws IOException if the pipe cannot be created or queried
-  * @throws InterruptedException if the calling thread is interrupted while waiting
-  */
- public Properties getMetaTags() throws IOException, InterruptedException {
-   if (pipeIn == null) {
-     getReader(); // spawn parsing thread
-   }
-   synchronized (this) {
-     // wait() gives up the monitor while sleeping, so the parser thread can still signal us.
-     while (!titleComplete && !pipeInStream.full()) {
-       wait(10);
-     }
-   }
-   return metaTags;
- }
-
-
- public String getSummary() throws IOException, InterruptedException {
- if (pipeIn == null)
- getReader(); // spawn parsing thread
- while (true) {
- synchronized(this) {
- if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
- break;
- wait(10);
- }
- }
- if (summary.length() > SUMMARY_LENGTH)
- summary.setLength(SUMMARY_LENGTH);
-
- String sum = summary.toString().trim();
- String tit = getTitle();
- if (sum.equals(""))
- return tit;
- else
- return sum;
- }
-
- /**
-  * Returns the reader end of the text pipe, creating the pipe and starting the
-  * {@code ParserThread} on the first call.
-  *
-  * @throws IOException if the piped streams cannot be connected
-  */
- public Reader getReader() throws IOException {
-   if (pipeIn != null) {
-     return pipeIn;
-   }
-
-   pipeInStream = new MyPipedInputStream();
-   pipeOutStream = new PipedOutputStream(pipeInStream);
-   pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
-   pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
-
-   Thread parserThread = new ParserThread(this);
-   parserThread.start(); // start parsing
-   return pipeIn;
- }
-
- /**
-  * Appends {@code text} to the summary while it is still shorter than {@code SUMMARY_LENGTH}
-  * and wakes waiting threads as soon as the threshold is reached.
-  */
- void addToSummary(String text) {
-   if (summary.length() >= SUMMARY_LENGTH) {
-     return; // summary already long enough
-   }
-   summary.append(text);
-   if (summary.length() < SUMMARY_LENGTH) {
-     return; // still collecting
-   }
-   synchronized (this) {
-     notifyAll();
-   }
- }
-
- void addText(String text) throws IOException {
- if (inStyle)
- return;
- if (inTitle)
- title.append(text);
- else {
- addToSummary(text);
- if (!titleComplete && !(title.length() == 0)) { // finished title
- synchronized(this) {
- titleComplete = true; // tell waiting threads
- notifyAll();
- }
- }
- }
-
- length += text.length();
- pipeOut.write(text);
-
- afterSpace = false;
- }
-
- void addMetaTag() {
- metaTags.setProperty(currentMetaTag, currentMetaContent);
- currentMetaTag = null;
- currentMetaContent = null;
- return;
- }
-
- void addSpace() throws IOException {
- if (!afterSpace) {
- if (inTitle)
- title.append(" ");
- else
- addToSummary(" ");
-
- String space = afterTag ? eol : " ";
- length += space.length();
- pipeOut.write(space);
- afterSpace = true;
- }
- }
-
- // Top-level production: consumes document items until the next token is none of the
- // kinds listed in the first switch, then requires token kind 0.
- // Tags, declarations, comments and scripts set afterTag; words, entities, punctuation
- // and spaces clear it after forwarding their text through addText()/addSpace().
- final public void HTMLDocument() throws ParseException, IOException {
- Token t;
- label_1:
- while (true) {
- // Peek at the next token kind; leave the loop when it cannot start a document item.
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case ScriptStart:
- case TagName:
- case DeclName:
- case Comment1:
- case Comment2:
- case Word:
- case Entity:
- case Space:
- case Punct:
- ;
- break;
- default:
- jj_la1[0] = jj_gen;
- break label_1;
- }
- // Dispatch on the same (already peeked) token kind.
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case TagName:
- Tag();
- afterTag = true;
- break;
- case DeclName:
- t = Decl();
- afterTag = true;
- break;
- case Comment1:
- case Comment2:
- CommentTag();
- afterTag = true;
- break;
- case ScriptStart:
- ScriptTag();
- afterTag = true;
- break;
- case Word:
- t = jj_consume_token(Word);
- addText(t.image); afterTag = false;
- break;
- case Entity:
- t = jj_consume_token(Entity);
- // Entities are decoded before being emitted as text.
- addText(Entities.decode(t.image)); afterTag = false;
- break;
- case Punct:
- t = jj_consume_token(Punct);
- addText(t.image); afterTag = false;
- break;
- case Space:
- jj_consume_token(Space);
- // addSpace() still sees the afterTag value left by the previous item.
- addSpace(); afterTag = false;
- break;
- default:
- jj_la1[1] = jj_gen;
- jj_consume_token(-1);
- throw new ParseException();
- }
- }
- jj_consume_token(0);
- }
-
- /**
-  * Parses one tag: the tag name, any number of attributes and the closing {@code TagEnd}.
-  *
-  * <p>Side effects: emits a separator for tags listed in {@code Tags.WS_ELEMS}; updates
-  * {@code inTitle}, {@code inMetaTag} and {@code inStyle} from the tag name; emits
-  * {@code [alt text]} for the {@code alt} attribute of an image tag; records meta
-  * {@code name}/{@code http-equiv} and {@code content} values (both lower-cased) through
-  * {@link #addMetaTag()} once both halves are known.
-  *
-  * <p>The pending meta halves are cleared whenever a new meta tag starts. Previously a meta
-  * tag carrying only {@code name} left its value behind, and the {@code content} of a later,
-  * unrelated meta tag was stored under that stale name.
-  *
-  * <p>NOTE(review): the file header marks this class as JavaCC output; the same change
-  * presumably has to be made in the grammar source as well - confirm.
-  *
-  * @throws ParseException if the token stream does not form a tag
-  * @throws IOException if emitting text to the pipe fails
-  */
- final public void Tag() throws ParseException, IOException {
-   Token t1, t2;
-   t1 = jj_consume_token(TagName);
-   String tagName = t1.image.toLowerCase(Locale.ENGLISH);
-   if (Tags.WS_ELEMS.contains(tagName)) {
-     addSpace();
-   }
-   inTitle = tagName.equalsIgnoreCase("<title");   // keep track if in <TITLE>
-   inMetaTag = tagName.equalsIgnoreCase("<META");  // keep track if in <META>
-   inStyle = tagName.equalsIgnoreCase("<STYLE");   // keep track if in <STYLE>
-   boolean inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
-
-   if (inMetaTag) {
-     // name/http-equiv and content belong to a single element; drop leftovers
-     // from an earlier, incomplete meta tag.
-     currentMetaTag = null;
-     currentMetaContent = null;
-   }
-
-   label_2:
-   while (true) {
-     switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-     case ArgName:
-       break;
-     default:
-       jj_la1[2] = jj_gen;
-       break label_2;
-     }
-     t1 = jj_consume_token(ArgName);
-     switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-     case ArgEquals:
-       jj_consume_token(ArgEquals);
-       switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-       case ArgValue:
-       case ArgQuote1:
-       case ArgQuote2:
-         // t2 is null for an empty quoted value.
-         t2 = ArgValue();
-         if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) {
-           addText("[" + t2.image + "]");
-         }
-
-         if (inMetaTag
-             && (t1.image.equalsIgnoreCase("name") || t1.image.equalsIgnoreCase("HTTP-EQUIV"))
-             && t2 != null) {
-           currentMetaTag = t2.image.toLowerCase(Locale.ENGLISH);
-           if (currentMetaTag != null && currentMetaContent != null) {
-             addMetaTag();
-           }
-         }
-         if (inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null) {
-           currentMetaContent = t2.image.toLowerCase(Locale.ENGLISH);
-           if (currentMetaTag != null && currentMetaContent != null) {
-             addMetaTag();
-           }
-         }
-         break;
-       default:
-         jj_la1[3] = jj_gen;
-       }
-       break;
-     default:
-       jj_la1[4] = jj_gen;
-     }
-   }
-   jj_consume_token(TagEnd);
- }
-
- // Parses an attribute value and returns its text token.
- // Alternatives, tried in order: a bare ArgValue token; an empty quoted value
- // (ArgQuote1 CloseQuote1, detected with 2-token lookahead jj_2_1); a quoted value with
- // Quote1Text; the same two forms for ArgQuote2/Quote2Text/CloseQuote2.
- // Returns null for the empty quoted forms, because t is never assigned on those paths.
- // Throws ParseException when the next token starts none of the alternatives.
- final public Token ArgValue() throws ParseException {
- Token t = null;
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case ArgValue:
- t = jj_consume_token(ArgValue);
- {if (true) return t;}
- break;
- default:
- jj_la1[5] = jj_gen;
- // Empty quoted value of the first quote style: nothing between the quotes.
- if (jj_2_1(2)) {
- jj_consume_token(ArgQuote1);
- jj_consume_token(CloseQuote1);
- {if (true) return t;}
- } else {
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case ArgQuote1:
- jj_consume_token(ArgQuote1);
- t = jj_consume_token(Quote1Text);
- jj_consume_token(CloseQuote1);
- {if (true) return t;}
- break;
- default:
- jj_la1[6] = jj_gen;
- // Empty quoted value of the second quote style.
- if (jj_2_2(2)) {
- jj_consume_token(ArgQuote2);
- jj_consume_token(CloseQuote2);
- {if (true) return t;}
- } else {
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case ArgQuote2:
- jj_consume_token(ArgQuote2);
- t = jj_consume_token(Quote2Text);
- jj_consume_token(CloseQuote2);
- {if (true) return t;}
- break;
- default:
- jj_la1[7] = jj_gen;
- jj_consume_token(-1);
- throw new ParseException();
- }
- }
- }
- }
- }
- // Every path above returns or throws; this only satisfies the compiler.
- throw new Error("Missing return statement in function");
- }
-
- /**
-  * Parses a declaration: a {@code DeclName} token, any mix of attribute names, equals signs
-  * and attribute values, then {@code TagEnd}.
-  *
-  * @return the {@code DeclName} token
-  * @throws ParseException if the token stream does not form a declaration
-  */
- final public Token Decl() throws ParseException {
-   final Token declToken = jj_consume_token(DeclName);
-   while (true) {
-     final int nextKind = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
-     if (nextKind == ArgName) {
-       jj_consume_token(ArgName);
-     } else if (nextKind == ArgEquals) {
-       jj_consume_token(ArgEquals);
-     } else if (nextKind == ArgValue || nextKind == ArgQuote1 || nextKind == ArgQuote2) {
-       ArgValue();
-     } else {
-       // Anything else ends the attribute list.
-       jj_la1[8] = jj_gen;
-       break;
-     }
-   }
-   jj_consume_token(TagEnd);
-   return declToken;
- }
-
- /**
-  * Parses a comment in either of its two token forms: {@code Comment1} followed by any
-  * number of {@code CommentText1} tokens and {@code CommentEnd1}, or the corresponding
-  * {@code Comment2} sequence. The comment text is discarded.
-  *
-  * @throws ParseException if the next token starts neither comment form
-  */
- final public void CommentTag() throws ParseException {
-   final int opener = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
-   if (opener == Comment1) {
-     jj_consume_token(Comment1);
-     while (((jj_ntk == -1) ? jj_ntk() : jj_ntk) == CommentText1) {
-       jj_consume_token(CommentText1);
-     }
-     jj_la1[10] = jj_gen;
-     jj_consume_token(CommentEnd1);
-   } else if (opener == Comment2) {
-     jj_consume_token(Comment2);
-     while (((jj_ntk == -1) ? jj_ntk() : jj_ntk) == CommentText2) {
-       jj_consume_token(CommentText2);
-     }
-     jj_la1[11] = jj_gen;
-     jj_consume_token(CommentEnd2);
-   } else {
-     jj_la1[12] = jj_gen;
-     jj_consume_token(-1);
-     throw new ParseException();
-   }
- }
-
- /**
-  * Parses a script section: {@code ScriptStart}, any number of {@code ScriptText} tokens,
-  * then {@code ScriptEnd}. The script text is discarded.
-  *
-  * @throws ParseException if the expected tokens are not found
-  */
- final public void ScriptTag() throws ParseException {
-   jj_consume_token(ScriptStart);
-   while (((jj_ntk == -1) ? jj_ntk() : jj_ntk) == ScriptText) {
-     jj_consume_token(ScriptText);
-   }
-   // Remember at which generation the repetition stopped (used for error reporting).
-   jj_la1[13] = jj_gen;
-   jj_consume_token(ScriptEnd);
- }
-
- // Lookahead #1: scans up to xla tokens ahead of the current token for the sequence checked
- // by jj_3_1(). Returns true when jj_3_1() reports no mismatch or when the scan ends with
- // LookaheadSuccess. The attempt is always recorded through jj_save(0, xla).
- private boolean jj_2_1(int xla) {
- jj_la = xla; jj_lastpos = jj_scanpos = token;
- try { return !jj_3_1(); }
- catch(LookaheadSuccess ls) { return true; }
- finally { jj_save(0, xla); }
- }
-
- // Lookahead #2: scans up to xla tokens ahead of the current token for the sequence checked
- // by jj_3_2(). Returns true when jj_3_2() reports no mismatch or when the scan ends with
- // LookaheadSuccess. The attempt is always recorded through jj_save(1, xla).
- private boolean jj_2_2(int xla) {
- jj_la = xla; jj_lastpos = jj_scanpos = token;
- try { return !jj_3_2(); }
- catch(LookaheadSuccess ls) { return true; }
- finally { jj_save(1, xla); }
- }
-
- /**
-  * Scans for {@code ArgQuote2} immediately followed by {@code CloseQuote2}.
-  *
-  * @return {@code true} on the first mismatch, {@code false} when both tokens match
-  */
- private boolean jj_3_2() {
-   // Short-circuit: the second token is only scanned when the first one matched.
-   return jj_scan_token(ArgQuote2) || jj_scan_token(CloseQuote2);
- }
-
- /**
-  * Scans for {@code ArgQuote1} immediately followed by {@code CloseQuote1}.
-  *
-  * @return {@code true} on the first mismatch, {@code false} when both tokens match
-  */
- private boolean jj_3_1() {
-   // Short-circuit: the second token is only scanned when the first one matched.
-   return jj_scan_token(ArgQuote1) || jj_scan_token(CloseQuote1);
- }
-
- /** Generated Token Manager. */
- public HTMLParserTokenManager token_source;
- // Character stream wrapped by the token manager; not assigned by the token-manager constructor.
- SimpleCharStream jj_input_stream;
- /** Current token. */
- public Token token;
- /** Next token. */
- public Token jj_nt;
- // Cached kind of the next token, or -1 when it has to be fetched again via jj_ntk().
- private int jj_ntk;
- // Cursor and frontier of the token scan performed by jj_scan_token().
- private Token jj_scanpos, jj_lastpos;
- // Remaining lookahead budget, decremented by jj_scan_token().
- private int jj_la;
- // Generation counter, incremented each time a token is consumed.
- private int jj_gen;
- // Per choice point: the value of jj_gen when that choice last took its default branch.
- final private int[] jj_la1 = new int[14];
- // Per choice point: bit mask that generateParseException() reads as the expected token kinds.
- static private int[] jj_la1_0;
- static {
- jj_la1_init_0();
- }
- // Fills jj_la1_0 with one mask per choice point (14 entries, matching jj_la1).
- private static void jj_la1_init_0() {
- jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
- }
- // One chain of recorded lookahead attempts per lookahead routine (jj_2_1, jj_2_2).
- final private JJCalls[] jj_2_rtns = new JJCalls[2];
- // True only while jj_rescan_token() replays lookaheads for error reporting.
- private boolean jj_rescan = false;
- // Counts consumed tokens; every 100+ tokens jj_consume_token() releases stale lookahead records.
- private int jj_gc = 0;
-
- /**
-  * Creates a parser reading from {@code stream}; delegates to the two-argument
-  * constructor with a {@code null} encoding.
-  */
- public HTMLParser(java.io.InputStream stream) {
-   this(stream, null);
- }
- /**
-  * Creates a parser reading from {@code stream} using the supplied encoding.
-  *
-  * @throws RuntimeException wrapping the {@code UnsupportedEncodingException} raised for an
-  *         unknown encoding
-  */
- public HTMLParser(java.io.InputStream stream, String encoding) {
-   try {
-     jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1);
-   } catch (java.io.UnsupportedEncodingException e) {
-     throw new RuntimeException(e);
-   }
-   token_source = new HTMLParserTokenManager(jj_input_stream);
-   token = new Token();
-   jj_ntk = -1;
-   jj_gen = 0;
-   // jj_la1 has exactly 14 slots, one per choice point.
-   java.util.Arrays.fill(jj_la1, -1);
-   for (int slot = 0; slot < jj_2_rtns.length; slot++) {
-     jj_2_rtns[slot] = new JJCalls();
-   }
- }
-
- /**
-  * Re-initialises the parser on a new input stream; delegates to the two-argument
-  * variant with a {@code null} encoding.
-  */
- public void ReInit(java.io.InputStream stream) {
-   ReInit(stream, null);
- }
- /**
-  * Re-initialises the parser on a new input stream with the supplied encoding and resets
-  * the token and lookahead state.
-  *
-  * @throws RuntimeException wrapping the {@code UnsupportedEncodingException} raised for an
-  *         unknown encoding
-  */
- public void ReInit(java.io.InputStream stream, String encoding) {
-   try {
-     jj_input_stream.ReInit(stream, encoding, 1, 1);
-   } catch (java.io.UnsupportedEncodingException e) {
-     throw new RuntimeException(e);
-   }
-   token_source.ReInit(jj_input_stream);
-   token = new Token();
-   jj_ntk = -1;
-   jj_gen = 0;
-   // jj_la1 has exactly 14 slots, one per choice point.
-   java.util.Arrays.fill(jj_la1, -1);
-   for (int slot = 0; slot < jj_2_rtns.length; slot++) {
-     jj_2_rtns[slot] = new JJCalls();
-   }
- }
-
- /** Creates a parser reading characters from {@code stream}. */
- public HTMLParser(java.io.Reader stream) {
-   jj_input_stream = new SimpleCharStream(stream, 1, 1);
-   token_source = new HTMLParserTokenManager(jj_input_stream);
-   token = new Token();
-   jj_ntk = -1;
-   jj_gen = 0;
-   // jj_la1 has exactly 14 slots, one per choice point.
-   java.util.Arrays.fill(jj_la1, -1);
-   for (int slot = 0; slot < jj_2_rtns.length; slot++) {
-     jj_2_rtns[slot] = new JJCalls();
-   }
- }
-
- /** Re-initialises the parser on a new reader and resets the token and lookahead state. */
- public void ReInit(java.io.Reader stream) {
-   jj_input_stream.ReInit(stream, 1, 1);
-   token_source.ReInit(jj_input_stream);
-   token = new Token();
-   jj_ntk = -1;
-   jj_gen = 0;
-   // jj_la1 has exactly 14 slots, one per choice point.
-   java.util.Arrays.fill(jj_la1, -1);
-   for (int slot = 0; slot < jj_2_rtns.length; slot++) {
-     jj_2_rtns[slot] = new JJCalls();
-   }
- }
-
- /**
-  * Creates a parser on top of an existing token manager. {@code jj_input_stream} is left
-  * unassigned by this constructor.
-  */
- public HTMLParser(HTMLParserTokenManager tm) {
-   token_source = tm;
-   token = new Token();
-   jj_ntk = -1;
-   jj_gen = 0;
-   // jj_la1 has exactly 14 slots, one per choice point.
-   java.util.Arrays.fill(jj_la1, -1);
-   for (int slot = 0; slot < jj_2_rtns.length; slot++) {
-     jj_2_rtns[slot] = new JJCalls();
-   }
- }
-
- /** Re-initialises the parser on another token manager and resets the lookahead state. */
- public void ReInit(HTMLParserTokenManager tm) {
-   token_source = tm;
-   token = new Token();
-   jj_ntk = -1;
-   jj_gen = 0;
-   // jj_la1 has exactly 14 slots, one per choice point.
-   java.util.Arrays.fill(jj_la1, -1);
-   for (int slot = 0; slot < jj_2_rtns.length; slot++) {
-     jj_2_rtns[slot] = new JJCalls();
-   }
- }
-
- // Advances to the next token and returns it if its kind equals `kind`.
- // On a mismatch the current token is restored, the expected kind is remembered in jj_kind
- // and the ParseException built by generateParseException() is thrown.
- private Token jj_consume_token(int kind) throws ParseException {
- Token oldToken;
- // Reuse a token already fetched by lookahead, otherwise pull one from the token manager.
- if ((oldToken = token).next != null) token = token.next;
- else token = token.next = token_source.getNextToken();
- jj_ntk = -1;
- if (token.kind == kind) {
- jj_gen++;
- // After more than 100 consumed tokens, drop the token references held by lookahead
- // records whose generation is older than the current one.
- if (++jj_gc > 100) {
- jj_gc = 0;
- for (int i = 0; i < jj_2_rtns.length; i++) {
- JJCalls c = jj_2_rtns[i];
- while (c != null) {
- if (c.gen < jj_gen) c.first = null;
- c = c.next;
- }
- }
- }
- return token;
- }
- token = oldToken;
- jj_kind = kind;
- throw generateParseException();
- }
-
- // Thrown by jj_scan_token() to abort a lookahead scan once the lookahead budget is used up
- // without a mismatch; a single shared instance (jj_ls) is reused.
- static private final class LookaheadSuccess extends java.lang.Error { }
- final private LookaheadSuccess jj_ls = new LookaheadSuccess();
- // Moves the scan position one token forward and compares that token with `kind`.
- // Returns true on a mismatch and false on a match; throws jj_ls when the token matches,
- // the budget jj_la has reached 0 and the scan is at its frontier.
- private boolean jj_scan_token(int kind) {
- if (jj_scanpos == jj_lastpos) {
- // At the frontier: spend one unit of lookahead and fetch a new token if necessary.
- jj_la--;
- if (jj_scanpos.next == null) {
- jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
- } else {
- jj_lastpos = jj_scanpos = jj_scanpos.next;
- }
- } else {
- jj_scanpos = jj_scanpos.next;
- }
- if (jj_rescan) {
- // Error-reporting replay: record the expected kind together with its distance from `token`.
- int i = 0; Token tok = token;
- while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
- if (tok != null) jj_add_error_token(kind, i);
- }
- if (jj_scanpos.kind != kind) return true;
- if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
- return false;
- }
-
-
-/** Get the next Token. */
- final public Token getNextToken() {
- if (token.next != null) token = token.next;
- else token = token.next = token_source.getNextToken();
- jj_ntk = -1;
- jj_gen++;
- return token;
- }
-
-/** Get the specific Token. */
- final public Token getToken(int index) {
- Token t = token;
- for (int i = 0; i < index; i++) {
- if (t.next != null) t = t.next;
- else t = t.next = token_source.getNextToken();
- }
- return t;
- }
-
- private int jj_ntk() {
- if ((jj_nt=token.next) == null)
- return (jj_ntk = (token.next=token_source.getNextToken()).kind);
- else
- return (jj_ntk = jj_nt.kind);
- }
-
- // Expected-token sequences collected for the ParseException being built.
- private java.util.List<int[]> jj_expentries = new java.util.ArrayList<int[]>();
- // Scratch entry currently being assembled.
- private int[] jj_expentry;
- // Token kind that jj_consume_token() failed to match, or -1.
- private int jj_kind = -1;
- // Kinds seen along the lookahead path being replayed, and how many of them are valid.
- private int[] jj_lasttokens = new int[100];
- private int jj_endpos;
-
- // Records, during the error-reporting replay, that `kind` was expected `pos` tokens ahead.
- // Positions of 100 or more are ignored. A position that extends the current path by one is
- // appended to jj_lasttokens; otherwise the path gathered so far is turned into a candidate
- // entry and a new path is started at `pos`.
- private void jj_add_error_token(int kind, int pos) {
- if (pos >= 100) return;
- if (pos == jj_endpos + 1) {
- jj_lasttokens[jj_endpos++] = kind;
- } else if (jj_endpos != 0) {
- jj_expentry = new int[jj_endpos];
- for (int i = 0; i < jj_endpos; i++) {
- jj_expentry[i] = jj_lasttokens[i];
- }
- // NOTE(review): as written, the candidate is appended only when an identical entry is
- // already in jj_expentries; a candidate without a match is never added here.
- // Presumably the intent was to add when no equal entry exists - confirm against the
- // generator version before changing generated code.
- jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
- int[] oldentry = (int[])(it.next());
- if (oldentry.length == jj_expentry.length) {
- for (int i = 0; i < jj_expentry.length; i++) {
- if (oldentry[i] != jj_expentry[i]) {
- continue jj_entries_loop;
- }
- }
- jj_expentries.add(jj_expentry);
- break jj_entries_loop;
- }
- }
- if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
- }
- }
-
- /**
-  * Builds the {@code ParseException} for the current position.
-  *
-  * <p>The expected token kinds are the kind that {@code jj_consume_token} just failed to
-  * match (if any) plus every kind flagged in the mask of a choice point whose default branch
-  * was taken at the current generation. Lookahead attempts are then replayed to collect
-  * further expected sequences.
-  */
- public ParseException generateParseException() {
-   jj_expentries.clear();
-   final boolean[] expected = new boolean[31];
-   if (jj_kind >= 0) {
-     expected[jj_kind] = true;
-     jj_kind = -1;
-   }
-   for (int choice = 0; choice < 14; choice++) {
-     if (jj_la1[choice] != jj_gen) {
-       continue; // this choice point was not reached at the failing position
-     }
-     final int mask = jj_la1_0[choice];
-     for (int bit = 0; bit < 32; bit++) {
-       if ((mask & (1<<bit)) != 0) {
-         expected[bit] = true;
-       }
-     }
-   }
-   for (int kind = 0; kind < 31; kind++) {
-     if (!expected[kind]) {
-       continue;
-     }
-     jj_expentry = new int[1];
-     jj_expentry[0] = kind;
-     jj_expentries.add(jj_expentry);
-   }
-   jj_endpos = 0;
-   jj_rescan_token();
-   jj_add_error_token(0, 0);
-   final int[][] exptokseq = new int[jj_expentries.size()][];
-   int row = 0;
-   for (int[] entry : jj_expentries) {
-     exptokseq[row++] = entry;
-   }
-   return new ParseException(token, exptokseq, tokenImage);
- }
-
- /** Enable tracing. The body is empty, so calling this has no effect. */
- final public void enable_tracing() {
- }
-
- /** Disable tracing. The body is empty, so calling this has no effect. */
- final public void disable_tracing() {
- }
-
- // Replays recorded lookahead attempts with jj_rescan set, so that jj_scan_token() reports
- // the expected kinds to jj_add_error_token(). Only records with gen > jj_gen are replayed.
- private void jj_rescan_token() {
- jj_rescan = true;
- for (int i = 0; i < 2; i++) {
- try {
- JJCalls p = jj_2_rtns[i];
- do {
- if (p.gen > jj_gen) {
- // Restore the scan state saved by jj_save() for this attempt.
- jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
- switch (i) {
- case 0: jj_3_1(); break;
- case 1: jj_3_2(); break;
- }
- }
- p = p.next;
- } while (p != null);
- // A LookaheadSuccess ends the replay of this chain; the remaining records are skipped.
- } catch(LookaheadSuccess ls) { }
- }
- jj_rescan = false;
- }
-
- // Records a lookahead attempt for routine `index`: walks the chain past records whose gen
- // is greater than jj_gen (appending a fresh record at the end if none can be reused), then
- // stores the generation reached, the starting token and the lookahead limit.
- private void jj_save(int index, int xla) {
- JJCalls p = jj_2_rtns[index];
- while (p.gen > jj_gen) {
- if (p.next == null) { p = p.next = new JJCalls(); break; }
- p = p.next;
- }
- // xla - jj_la is the amount of lookahead budget spent by the scan.
- p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
- }
-
- // Linked-list node describing one recorded lookahead attempt (written by jj_save()).
- static final class JJCalls {
- // Generation value computed by jj_save(): jj_gen + xla - jj_la.
- int gen;
- // Token at which the lookahead started; cleared by jj_consume_token() once stale.
- Token first;
- // Lookahead limit (xla) the attempt was made with.
- int arg;
- // Next record in the chain, or null.
- JJCalls next;
- }
-
-// void handleException(Exception e) {
-// System.out.println(e.toString()); // print the error message
-// System.out.println("Skipping...");
-// Token t;
-// do {
-// t = getNextToken();
-// } while (t.kind != TagEnd);
-// }
-}