1 /* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
2 package org.apache.lucene.benchmark.byTask.feeds.demohtml;
5 import java.util.Locale;
6 import java.util.Properties;
8 public class HTMLParser implements HTMLParserConstants {
// Length the summary buffer is cut down to in getSummary(); also used to size the buffers below.
9 public static int SUMMARY_LENGTH = 200;
// Collected title text; initial capacity is SUMMARY_LENGTH.
11 StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
// Collected summary text; initial capacity is twice SUMMARY_LENGTH.
12 StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
// Meta tag pairs, stored with setProperty(currentMetaTag, currentMetaContent).
13 Properties metaTags=new Properties();
// Name and content of the meta tag being assembled; both are reset to null once stored.
14 String currentMetaTag=null;
15 String currentMetaContent=null;
// Set to true when the title is finished (see addText()); read by getTitle() and getMetaTags().
17 boolean titleComplete = false;
// Flags assigned in Tag() from the tag name: "<title", "<META" and "<STYLE" respectively.
18 boolean inTitle = false;
19 boolean inMetaTag = false;
20 boolean inStyle = false;
// Cleared by HTMLDocument() after each text/space token; addSpace() writes eol instead of " " while it is true.
21 boolean afterTag = false;
// NOTE(review): never assigned in the lines shown here; presumably tracks whether the last output was a space — confirm.
22 boolean afterSpace = false;
// Platform line separator, used by addSpace().
23 String eol = System.getProperty("line.separator");
// Both ends of the pipe created in getReader(); the getters call pipeInStream.full().
26 private MyPipedInputStream pipeInStream = null;
27 private PipedOutputStream pipeOutStream = null;
/** PipedInputStream subclass that adds the {@link #full()} check. */
29 private class MyPipedInputStream extends PipedInputStream{
// No-argument constructor.
31 public MyPipedInputStream(){
// Constructor that takes the PipedOutputStream feeding this stream.
35 public MyPipedInputStream(PipedOutputStream src) throws IOException{
// True when the number of bytes available to read has reached PipedInputStream.PIPE_SIZE.
39 public boolean full() throws IOException{
40 return this.available() >= PipedInputStream.PIPE_SIZE;
/**
 * Returns the text collected in the title buffer, converted to a String and trimmed.
 *
 * @return the trimmed title text
 * @throws IOException declared; getReader() and pipeInStream.full() are declared to throw it
 * @throws InterruptedException declared by this method
 */
44 public String getTitle() throws IOException, InterruptedException {
46 getReader(); // spawn parsing thread
// Condition checked: the title has been marked complete, or the pipe buffer is full.
49 if (titleComplete || pipeInStream.full())
54 return title.toString().trim();
/**
 * Returns meta tag data as a Properties object.
 * NOTE(review): the return statement is not among the lines shown here; presumably
 * the metaTags field is returned — confirm.
 *
 * @throws IOException declared; getReader() and pipeInStream.full() are declared to throw it
 * @throws InterruptedException declared by this method
 */
57 public Properties getMetaTags() throws IOException,
58 InterruptedException {
60 getReader(); // spawn parsing thread
// Same condition as getTitle(): title marked complete, or the pipe buffer is full.
63 if (titleComplete || pipeInStream.full())
/**
 * Builds summary text from the summary buffer and the title.
 * NOTE(review): the return statement is not among the lines shown here, so how
 * {@code sum} and {@code tit} are combined cannot be stated — confirm.
 *
 * @throws IOException declared; getReader() and pipeInStream.full() are declared to throw it
 * @throws InterruptedException declared by this method
 */
72 public String getSummary() throws IOException, InterruptedException {
74 getReader(); // spawn parsing thread
// Condition checked: enough summary text has been collected, or the pipe buffer is full.
77 if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
// Cut the buffer itself down to SUMMARY_LENGTH characters.
82 if (summary.length() > SUMMARY_LENGTH)
83 summary.setLength(SUMMARY_LENGTH);
// Trimmed summary text and the title, both as Strings.
85 String sum = summary.toString().trim();
86 String tit = getTitle();
/**
 * Creates the pipe between parser output and the caller, and starts a ParserThread.
 * NOTE(review): the return statement is not among the lines shown here; presumably
 * {@code pipeIn} is returned — confirm.
 *
 * @throws IOException declared by this method
 */
93 public Reader getReader() throws IOException {
// Create the input end, then connect a new output end to it.
95 pipeInStream = new MyPipedInputStream();
96 pipeOutStream = new PipedOutputStream(pipeInStream);
// Reader and writer over the pipe use the same charset name, "UTF-16BE".
97 pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
98 pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
// Create a ParserThread for this parser and start it.
100 Thread thread = new ParserThread(this);
101 thread.start(); // start parsing
/**
 * Appends text to the summary buffer while the buffer is shorter than SUMMARY_LENGTH.
 *
 * @param text the text to append
 */
107 void addToSummary(String text) {
108 if (summary.length() < SUMMARY_LENGTH) {
109 summary.append(text);
// Detects the append that brings the buffer up to SUMMARY_LENGTH (the action taken is not shown here).
110 if (summary.length() >= SUMMARY_LENGTH) {
/**
 * Handles a piece of document text.
 * NOTE(review): the conditions guarding several statements below are not among the
 * lines shown here; the comments describe only the visible statements.
 *
 * @param text the text to handle
 * @throws IOException declared by this method
 */
118 void addText(String text) throws IOException {
// The title counts as finished when it is not yet marked complete and the buffer is non-empty.
125 if (!titleComplete && !(title.length() == 0)) { // finished title
127 titleComplete = true; // tell waiting threads
// Add the text's length to the running length counter.
133 length += text.length();
// Store the assembled meta tag pair, then reset both halves.
140 metaTags.setProperty(currentMetaTag, currentMetaContent);
141 currentMetaTag = null;
142 currentMetaContent = null;
/**
 * Writes whitespace to the output pipe.
 *
 * @throws IOException declared by this method; pipeOut.write is called here
 */
146 void addSpace() throws IOException {
// The line separator is used when afterTag is set, a single space otherwise.
153 String space = afterTag ? eol : " ";
154 length += space.length();
155 pipeOut.write(space);
/**
 * Parser method for the HTML document as a whole. Dispatches on the kind of the
 * next token and passes text tokens to addText() and spaces to addSpace().
 *
 * @throws ParseException thrown when no alternative matches the next token
 * @throws IOException declared; addText() and addSpace() are declared to throw it
 */
160 final public void HTMLDocument() throws ParseException, IOException {
// jj_ntk caches the next token kind; -1 means it must be fetched with jj_ntk().
164 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
180 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// Word: the token image is added as text.
199 t = jj_consume_token(Word);
200 addText(t.image); afterTag = false;
// Entity: the image is passed through Entities.decode before being added.
203 t = jj_consume_token(Entity);
204 addText(Entities.decode(t.image)); afterTag = false;
// Punct: the token image is added as text.
207 t = jj_consume_token(Punct);
208 addText(t.image); afterTag = false;
// Space: whitespace is emitted through addSpace().
211 jj_consume_token(Space);
212 addSpace(); afterTag = false;
// No alternative matched.
216 jj_consume_token(-1);
217 throw new ParseException();
/**
 * Parser method for a tag: consumes the tag name, its arguments, and the tag end.
 * Updates the inTitle/inMetaTag/inStyle flags, adds IMG alt text as document text,
 * and collects meta tag names and contents.
 *
 * @throws ParseException declared; jj_consume_token is declared to throw it
 * @throws IOException declared; addText() is declared to throw it
 */
223 final public void Tag() throws ParseException, IOException {
225 boolean inImg = false;
226 t1 = jj_consume_token(TagName);
// Lower-cased with a fixed locale rather than the default one.
227 String tagName = t1.image.toLowerCase(Locale.ENGLISH);
// Test whether this tag name is listed in Tags.WS_ELEMS.
228 if(Tags.WS_ELEMS.contains(tagName) ) {
// The literals compared against include the leading '<'.
231 inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
232 inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
233 inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
234 inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
238 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// From here on t1 holds the argument name token, no longer the tag name.
246 t1 = jj_consume_token(ArgName);
247 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
249 jj_consume_token(ArgEquals);
250 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// The alt text of an IMG tag is added to the document text in square brackets.
255 if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
256 addText("[" + t2.image + "]");
// A "name" or "HTTP-EQUIV" argument supplies the meta tag name, stored lower-cased.
259 ( t1.image.equalsIgnoreCase("name") ||
260 t1.image.equalsIgnoreCase("HTTP-EQUIV")
264 currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
265 if(currentMetaTag != null && currentMetaContent != null) {
269 if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
// The "content" argument value is stored lower-cased as well.
272 currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
273 if(currentMetaTag != null && currentMetaContent != null) {
288 jj_consume_token(TagEnd);
/**
 * Parser method for an argument value: an ArgValue token, or text enclosed in
 * ArgQuote1/CloseQuote1 or ArgQuote2/CloseQuote2.
 *
 * @return the token {@code t} as set by the matched alternative
 * @throws ParseException thrown when no alternative matches the next token
 */
291 final public Token ArgValue() throws ParseException {
293 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// An ArgValue token is returned as is.
295 t = jj_consume_token(ArgValue);
// Written as "if (true) return" so the statements that follow are not rejected as unreachable.
296 {if (true) return t;}
// Opening quote directly followed by its closing quote; t is not assigned in this alternative.
301 jj_consume_token(ArgQuote1);
302 jj_consume_token(CloseQuote1);
303 {if (true) return t;}
305 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// Quoted text: the Quote1Text token between the quotes is returned.
307 jj_consume_token(ArgQuote1);
308 t = jj_consume_token(Quote1Text);
309 jj_consume_token(CloseQuote1);
310 {if (true) return t;}
// The same two alternatives for the second quote kind.
315 jj_consume_token(ArgQuote2);
316 jj_consume_token(CloseQuote2);
317 {if (true) return t;}
319 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
321 jj_consume_token(ArgQuote2);
322 t = jj_consume_token(Quote2Text);
323 jj_consume_token(CloseQuote2);
324 {if (true) return t;}
// No alternative matched.
328 jj_consume_token(-1);
329 throw new ParseException();
// Every alternative above returns or throws before this point.
335 throw new Error("Missing return statement in function");
/**
 * Parser method for a declaration: a DeclName token, then arguments, then TagEnd.
 *
 * @return the DeclName token
 * @throws ParseException thrown when no alternative matches the next token
 */
338 final public Token Decl() throws ParseException {
340 t = jj_consume_token(DeclName);
343 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
355 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// Argument tokens are consumed; their values are not kept.
357 jj_consume_token(ArgName);
365 jj_consume_token(ArgEquals);
// No alternative matched.
369 jj_consume_token(-1);
370 throw new ParseException();
373 jj_consume_token(TagEnd);
374 {if (true) return t;}
375 throw new Error("Missing return statement in function");
/**
 * Parser method for a comment. Two forms are handled: Comment1 ... CommentEnd1
 * and Comment2 ... CommentEnd2. The comment text tokens are consumed and not kept.
 *
 * @throws ParseException thrown when the next token starts neither form
 */
378 final public void CommentTag() throws ParseException {
379 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
// First comment form.
381 jj_consume_token(Comment1);
384 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
392 jj_consume_token(CommentText1);
394 jj_consume_token(CommentEnd1);
// Second comment form.
397 jj_consume_token(Comment2);
400 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
408 jj_consume_token(CommentText2);
410 jj_consume_token(CommentEnd2);
// No alternative matched.
414 jj_consume_token(-1);
415 throw new ParseException();
/**
 * Parser method for a script section: ScriptStart, ScriptText tokens, then ScriptEnd.
 * The script text is consumed and not kept.
 *
 * @throws ParseException declared; jj_consume_token is declared to throw it
 */
419 final public void ScriptTag() throws ParseException {
420 jj_consume_token(ScriptStart);
423 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
431 jj_consume_token(ScriptText);
433 jj_consume_token(ScriptEnd);
/**
 * Lookahead check number 1: runs jj_3_1() from the current token with a limit of xla.
 *
 * @param xla the lookahead limit stored in jj_la
 * @return the negation of jj_3_1(), or true when LookaheadSuccess is thrown
 */
436 private boolean jj_2_1(int xla) {
// Start scanning from the current token.
437 jj_la = xla; jj_lastpos = jj_scanpos = token;
438 try { return !jj_3_1(); }
439 catch(LookaheadSuccess ls) { return true; }
// The call is recorded under index 0 whatever the outcome.
440 finally { jj_save(0, xla); }
/**
 * Lookahead check number 2: runs jj_3_2() from the current token with a limit of xla.
 *
 * @param xla the lookahead limit stored in jj_la
 * @return the negation of jj_3_2(), or true when LookaheadSuccess is thrown
 */
443 private boolean jj_2_2(int xla) {
// Start scanning from the current token.
444 jj_la = xla; jj_lastpos = jj_scanpos = token;
445 try { return !jj_3_2(); }
446 catch(LookaheadSuccess ls) { return true; }
// The call is recorded under index 1 whatever the outcome.
447 finally { jj_save(1, xla); }
/**
 * Scans for ArgQuote2 immediately followed by CloseQuote2.
 *
 * @return true as soon as one of the two scans reports a mismatch
 */
450 private boolean jj_3_2() {
451 if (jj_scan_token(ArgQuote2)) return true;
452 if (jj_scan_token(CloseQuote2)) return true;
/**
 * Scans for ArgQuote1 immediately followed by CloseQuote1.
 *
 * @return true as soon as one of the two scans reports a mismatch
 */
456 private boolean jj_3_1() {
457 if (jj_scan_token(ArgQuote1)) return true;
458 if (jj_scan_token(CloseQuote1)) return true;
462 /** Generated Token Manager. */
463 public HTMLParserTokenManager token_source;
// Character stream handed to the token manager by the constructors and ReInit methods.
464 SimpleCharStream jj_input_stream;
465 /** Current token. */
// Scan position and last scanned position used by the lookahead methods.
470 private Token jj_scanpos, jj_lastpos;
// 14 entries, reset to -1 on construction and ReInit; compared against jj_gen in generateParseException().
473 final private int[] jj_la1 = new int[14];
// Token-kind bitmasks, one per jj_la1 entry; filled in by jj_la1_init_0().
474 static private int[] jj_la1_0;
/** Assigns the 14-entry bitmask table jj_la1_0; generateParseException() tests bit j for token kind j. */
478 private static void jj_la1_init_0() {
479 jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,};
// One JJCalls record per lookahead method (jj_2_1, jj_2_2), indexed 0 and 1.
481 final private JJCalls[] jj_2_rtns = new JJCalls[2];
// NOTE(review): not assigned in the lines shown here; presumably set while jj_rescan_token() runs — confirm.
482 private boolean jj_rescan = false;
// NOTE(review): not used in the lines shown here; presumably a counter used by jj_consume_token() — confirm.
483 private int jj_gc = 0;
485 /** Constructor with InputStream. */
// NOTE(review): the body is not shown here; presumably it delegates to the two-argument constructor — confirm.
486 public HTMLParser(java.io.InputStream stream) {
489 /** Constructor with InputStream and supplied encoding */
490 public HTMLParser(java.io.InputStream stream, String encoding) {
// An unsupported encoding is rethrown as a RuntimeException that wraps the original exception.
491 try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
492 token_source = new HTMLParserTokenManager(jj_input_stream);
// Reset all 14 jj_la1 entries and give each lookahead method a fresh JJCalls record.
496 for (int i = 0; i < 14; i++) jj_la1[i] = -1;
497 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
/** Reinitialises the parser on an InputStream; passes null as the encoding to the two-argument ReInit. */
501 public void ReInit(java.io.InputStream stream) {
502 ReInit(stream, null);
/**
 * Reinitialises the parser on an InputStream with the given encoding.
 *
 * @param stream the new input
 * @param encoding the encoding name passed to the character stream
 */
505 public void ReInit(java.io.InputStream stream, String encoding) {
// An unsupported encoding is rethrown as a RuntimeException that wraps the original exception.
506 try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
507 token_source.ReInit(jj_input_stream);
// Reset all 14 jj_la1 entries and give each lookahead method a fresh JJCalls record.
511 for (int i = 0; i < 14; i++) jj_la1[i] = -1;
512 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
/** Constructor with Reader: builds the character stream and token manager on top of it. */
516 public HTMLParser(java.io.Reader stream) {
517 jj_input_stream = new SimpleCharStream(stream, 1, 1);
518 token_source = new HTMLParserTokenManager(jj_input_stream);
// Reset all 14 jj_la1 entries and give each lookahead method a fresh JJCalls record.
522 for (int i = 0; i < 14; i++) jj_la1[i] = -1;
523 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
/** Reinitialises the parser on a Reader, reusing the existing character stream and token manager. */
527 public void ReInit(java.io.Reader stream) {
528 jj_input_stream.ReInit(stream, 1, 1);
529 token_source.ReInit(jj_input_stream);
// Reset all 14 jj_la1 entries and give each lookahead method a fresh JJCalls record.
533 for (int i = 0; i < 14; i++) jj_la1[i] = -1;
534 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
537 /** Constructor with generated Token Manager. */
538 public HTMLParser(HTMLParserTokenManager tm) {
// Reset all 14 jj_la1 entries and give each lookahead method a fresh JJCalls record.
543 for (int i = 0; i < 14; i++) jj_la1[i] = -1;
544 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
/** Reinitialises the parser with a token manager supplied by the caller. */
548 public void ReInit(HTMLParserTokenManager tm) {
// Reset all 14 jj_la1 entries and give each lookahead method a fresh JJCalls record.
553 for (int i = 0; i < 14; i++) jj_la1[i] = -1;
554 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
/**
 * Advances to the next token and tests it against the expected kind.
 *
 * @param kind the expected token kind
 * @return NOTE(review): the return statement is not shown here; presumably the consumed token — confirm
 * @throws ParseException built by generateParseException(); NOTE(review): presumably only
 *         reached when the kind does not match — the lines in between are not shown, confirm
 */
557 private Token jj_consume_token(int kind) throws ParseException {
// Use the already linked next token if there is one; otherwise fetch one and link it in.
559 if ((oldToken = token).next != null) token = token.next;
560 else token = token.next = token_source.getNextToken();
562 if (token.kind == kind) {
// Drop the saved start token of every lookahead record whose generation is older than jj_gen.
566 for (int i = 0; i < jj_2_rtns.length; i++) {
567 JJCalls c = jj_2_rtns[i];
569 if (c.gen < jj_gen) c.first = null;
578 throw generateParseException();
// Error subclass thrown by jj_scan_token() and caught in jj_2_1(), jj_2_2() and jj_rescan_token().
581 static private final class LookaheadSuccess extends java.lang.Error { }
// The single instance that jj_scan_token() throws.
582 final private LookaheadSuccess jj_ls = new LookaheadSuccess();
/**
 * Moves the scan position forward by one token and compares that token with the given kind.
 *
 * @param kind the token kind to look for
 * @return true when the scanned token's kind differs from {@code kind}
 */
583 private boolean jj_scan_token(int kind) {
// At the far end of what has been scanned: move both positions, fetching a new token if none is linked yet.
584 if (jj_scanpos == jj_lastpos) {
586 if (jj_scanpos.next == null) {
587 jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
589 jj_lastpos = jj_scanpos = jj_scanpos.next;
// Otherwise only the scan position moves.
592 jj_scanpos = jj_scanpos.next;
// Count the tokens between the current token and the scan position; record the expected kind at that offset.
595 int i = 0; Token tok = token;
596 while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
597 if (tok != null) jj_add_error_token(kind, i);
599 if (jj_scanpos.kind != kind) return true;
// Thrown when jj_la is 0 and the scan position has caught up with the last scanned position.
600 if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
605 /** Get the next Token. */
606 final public Token getNextToken() {
// Use the already linked next token if there is one; otherwise fetch one and link it in.
607 if (token.next != null) token = token.next;
608 else token = token.next = token_source.getNextToken();
614 /** Get the specific Token. */
615 final public Token getToken(int index) {
// Step forward index times, fetching and linking tokens that have not been read yet.
617 for (int i = 0; i < index; i++) {
618 if (t.next != null) t = t.next;
619 else t = t.next = token_source.getNextToken();
/**
 * Finds the kind of the token after the current one and stores it in the jj_ntk field.
 *
 * @return the kind of the next token
 */
624 private int jj_ntk() {
// No next token linked yet: fetch one, link it in, and use its kind.
625 if ((jj_nt=token.next) == null)
626 return (jj_ntk = (token.next=token_source.getNextToken()).kind);
628 return (jj_ntk = jj_nt.kind);
// Expected-token sequences gathered for the ParseException built by generateParseException().
631 private java.util.List<int[]> jj_expentries = new java.util.ArrayList<int[]>();
// The entry currently being built.
632 private int[] jj_expentry;
// Indexes la1tokens in generateParseException().
633 private int jj_kind = -1;
// Token kinds recorded by jj_add_error_token(); positions of 100 and above are ignored there.
634 private int[] jj_lasttokens = new int[100];
// Number of slots of jj_lasttokens in use.
635 private int jj_endpos;
/**
 * Records a token kind at a given position for error reporting.
 *
 * @param kind the token kind to record
 * @param pos the position it was expected at; positions of 100 and above are ignored
 */
637 private void jj_add_error_token(int kind, int pos) {
638 if (pos >= 100) return;
// Next position in sequence: extend the sequence being collected.
639 if (pos == jj_endpos + 1) {
640 jj_lasttokens[jj_endpos++] = kind;
// Otherwise, if a sequence has been collected, copy it into a new entry.
641 } else if (jj_endpos != 0) {
642 jj_expentry = new int[jj_endpos];
643 for (int i = 0; i < jj_endpos; i++) {
644 jj_expentry[i] = jj_lasttokens[i];
// Walk the stored entries, comparing each one of equal length element by element.
646 jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
647 int[] oldentry = (int[])(it.next());
648 if (oldentry.length == jj_expentry.length) {
649 for (int i = 0; i < jj_expentry.length; i++) {
650 if (oldentry[i] != jj_expentry[i]) {
// First difference: go on to the next stored entry.
651 continue jj_entries_loop;
654 jj_expentries.add(jj_expentry);
655 break jj_entries_loop;
// Record kind at pos - 1 and make pos the new end of the sequence.
658 if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
662 /** Generate ParseException. */
663 public ParseException generateParseException() {
664 jj_expentries.clear();
// One flag per token kind, 31 in all.
665 boolean[] la1tokens = new boolean[31];
667 la1tokens[jj_kind] = true;
// For each jj_la1 entry equal to jj_gen, test the bits of the matching jj_la1_0 mask.
670 for (int i = 0; i < 14; i++) {
671 if (jj_la1[i] == jj_gen) {
672 for (int j = 0; j < 32; j++) {
673 if ((jj_la1_0[i] & (1<<j)) != 0) {
// Single-element entries are added to the expected list.
679 for (int i = 0; i < 31; i++) {
681 jj_expentry = new int[1];
683 jj_expentries.add(jj_expentry);
688 jj_add_error_token(0, 0);
// Copy the list into the int[][] that the ParseException constructor takes.
689 int[][] exptokseq = new int[jj_expentries.size()][];
690 for (int i = 0; i < jj_expentries.size(); i++) {
691 exptokseq[i] = jj_expentries.get(i);
693 return new ParseException(token, exptokseq, tokenImage);
696 /** Enable tracing. */
// NOTE(review): the body is not shown here.
697 final public void enable_tracing() {
700 /** Disable tracing. */
// NOTE(review): the body is not shown here.
701 final public void disable_tracing() {
/** Re-runs the saved lookahead calls whose generation is newer than jj_gen. */
704 private void jj_rescan_token() {
// One pass per lookahead method.
706 for (int i = 0; i < 2; i++) {
708 JJCalls p = jj_2_rtns[i];
710 if (p.gen > jj_gen) {
// Restore the limit and start token saved by jj_save().
711 jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
// Index 0 re-runs jj_3_1, index 1 re-runs jj_3_2; the results are discarded.
713 case 0: jj_3_1(); break;
714 case 1: jj_3_2(); break;
// LookaheadSuccess is ignored here.
719 } catch(LookaheadSuccess ls) { }
/**
 * Records a lookahead call in the JJCalls chain for the given index.
 *
 * @param index index into jj_2_rtns (0 for jj_2_1, 1 for jj_2_2)
 * @param xla the lookahead limit the call was made with
 */
724 private void jj_save(int index, int xla) {
725 JJCalls p = jj_2_rtns[index];
// Skip records whose generation is newer than jj_gen; append a new record at the end of the chain.
726 while (p.gen > jj_gen) {
727 if (p.next == null) { p = p.next = new JJCalls(); break; }
// Store the generation, start token and limit for jj_rescan_token().
730 p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
// Record of one lookahead call; jj_save() and jj_rescan_token() use its gen, first, arg and next members.
733 static final class JJCalls {
740 // void handleException(Exception e) {
741 // System.out.println(e.toString()); // print the error message
742 // System.out.println("Skipping...");
745 // t = getNextToken();
746 // } while (t.kind != TagEnd);