lucene-java-3.4.0/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 // HTMLParser.jj
  19
  20 options {
  21   STATIC = false;
  22   //DEBUG_LOOKAHEAD = true;
  23   //DEBUG_TOKEN_MANAGER = true;
  24   UNICODE_INPUT = true;
  25 }
  26
  27 PARSER_BEGIN(HTMLParser)
  28
  29 package org.apache.lucene.benchmark.byTask.feeds.demohtml;
  30
  31 import java.io.*;
  32 import java.util.Locale;
  33 import java.util.Properties;
  34
  35 public class HTMLParser {
  36   public static int SUMMARY_LENGTH = 200;
  37
  38   StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  39   StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  40   Properties metaTags=new Properties();
  41   String currentMetaTag=null;
  42   String currentMetaContent=null;
  43   int length = 0;
  44   boolean titleComplete = false;
  45   boolean inTitle = false;
  46   boolean inMetaTag = false;
  47   boolean inStyle = false;
  48   boolean afterTag = false;
  49   boolean afterSpace = false;
  50   String eol = System.getProperty("line.separator");
  51   Reader pipeIn = null;
  52   Writer pipeOut;
  53   private MyPipedInputStream pipeInStream = null;
  54   private PipedOutputStream pipeOutStream = null;
  55
  56   private class MyPipedInputStream extends PipedInputStream{
  57
  58     public MyPipedInputStream(){
  59       super();
  60     }
  61
  62     public MyPipedInputStream(PipedOutputStream src) throws IOException{
  63       super(src);
  64     }
  65
  66     public boolean full() throws IOException{
  67       return this.available() >= PipedInputStream.PIPE_SIZE;
  68     }
  69   }
  70
  71   public String getTitle() throws IOException, InterruptedException {
  72     if (pipeIn == null)
  73       getReader();                                // spawn parsing thread
  74     while (true) {
  75       synchronized(this) {
  76         if (titleComplete || pipeInStream.full())
  77           break;
  78         wait(10);
  79       }
  80     }
  81     return title.toString().trim();
  82   }
  83
  84   public Properties getMetaTags() throws IOException,
  85 InterruptedException {
  86     if (pipeIn == null)
  87       getReader();                                // spawn parsing thread
  88     while (true) {
  89       synchronized(this) {
  90         if (titleComplete || pipeInStream.full())
  91           break;
  92         wait(10);
  93       }
  94     }
  95     return metaTags;
  96   }
  97
  98
  99   public String getSummary() throws IOException, InterruptedException {
 100     if (pipeIn == null)
 101       getReader();                                // spawn parsing thread
 102     while (true) {
 103       synchronized(this) {
 104         if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
 105           break;
 106         wait(10);
 107       }
 108     }
 109     if (summary.length() > SUMMARY_LENGTH)
 110       summary.setLength(SUMMARY_LENGTH);
 111
 112     String sum = summary.toString().trim();
 113     String tit = getTitle();
 114     if (sum.equals(""))
 115       return tit;
 116     else
 117       return sum;
 118   }
 119
 120   public Reader getReader() throws IOException {
 121     if (pipeIn == null) {
 122       pipeInStream = new MyPipedInputStream();
 123       pipeOutStream = new PipedOutputStream(pipeInStream);
 124       pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
 125       pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
 126
 127       Thread thread = new ParserThread(this);
 128       thread.start();                             // start parsing
 129     }
 130
 131     return pipeIn;
 132   }
 133
 134   void addToSummary(String text) {
 135     if (summary.length() < SUMMARY_LENGTH) {
 136       summary.append(text);
 137       if (summary.length() >= SUMMARY_LENGTH) {
 138         synchronized(this) {
 139           notifyAll();
 140         }
 141       }
 142     }
 143   }
 144
 145   void addText(String text) throws IOException {
 146     if (inStyle)
 147       return;
 148     if (inTitle)
 149       title.append(text);
 150     else {
 151       addToSummary(text);
 152       if (!titleComplete && !(title.length() == 0)) {  // finished title
 153         synchronized(this) {
 154           titleComplete = true;                   // tell waiting threads
 155           notifyAll();
 156         }
 157       }
 158     }
 159
 160     length += text.length();
 161     pipeOut.write(text);
 162
 163     afterSpace = false;
 164   }
 165
 166   void addMetaTag() {
 167       metaTags.setProperty(currentMetaTag, currentMetaContent);
 168       currentMetaTag = null;
 169       currentMetaContent = null;
 170       return;
 171   }
 172
 173   void addSpace() throws IOException {
 174     if (!afterSpace) {
 175       if (inTitle)
 176         title.append(" ");
 177       else
 178         addToSummary(" ");
 179
 180       String space = afterTag ? eol : " ";
 181       length += space.length();
 182       pipeOut.write(space);
 183       afterSpace = true;
 184     }
 185   }
 186
 187 //    void handleException(Exception e) {
 188 //      System.out.println(e.toString());  // print the error message
 189 //      System.out.println("Skipping...");
 190 //      Token t;
 191 //      do {
 192 //        t = getNextToken();
 193 //      } while (t.kind != TagEnd);
 194 //    }
 195 }
 196
 197 PARSER_END(HTMLParser)
 198
 199
  200 void HTMLDocument() throws IOException :
      // Top-level production: consumes markup and text items until <EOF>.
      // After every item afterTag is set to true for markup (tag, declaration,
      // comment, script) and to false for text or whitespace.
      // Word and Punct images go to addText() unchanged, Entity images are
      // decoded with Entities.decode() first, and Space goes to addSpace().
  201 {
  202   Token t;
  203 }
  204 {
  205 //  try {
  206     ( Tag()         { afterTag = true; }
          // The token returned by Decl() is assigned but not used afterwards.
  207     | t=Decl()      { afterTag = true; }
  208     | CommentTag()  { afterTag = true; }
  209     | ScriptTag()  { afterTag = true; }
  210     | t=<Word>      { addText(t.image); afterTag = false; }
  211     | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
  212     | t=<Punct>     { addText(t.image); afterTag = false; }
  213     | <Space>       { addSpace(); afterTag = false; }
  214     )* <EOF>
  215 //  } catch (ParseException e) {
  216 //    handleException(e);
  217 //  }
  218 }
 219
 220 void Tag() throws IOException :
 221 {
 222   Token t1, t2;
 223   boolean inImg = false;
 224 }
 225 {
 226   t1=<TagName> {
 227    String tagName = t1.image.toLowerCase(Locale.ENGLISH);
 228    if(Tags.WS_ELEMS.contains(tagName) ) {
 229       addSpace();
 230     }
 231     inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
 232     inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
 233     inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
 234     inImg = tagName.equalsIgnoreCase("<img");     // keep track if in <IMG>
 235   }
 236   (t1=<ArgName>
 237    (<ArgEquals>
 238     (t2=ArgValue()                                // save ALT text in IMG tag
 239      {
 240        if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
 241          addText("[" + t2.image + "]");
 242
 243         if(inMetaTag &&
 244                         (  t1.image.equalsIgnoreCase("name") ||
 245                            t1.image.equalsIgnoreCase("HTTP-EQUIV")
 246                         )
 247            && t2 != null)
 248         {
 249                 currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
 250                 if(currentMetaTag != null && currentMetaContent != null) {
 251                 addMetaTag();
 252                 }
 253         }
 254         if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
 255 null)
 256         {
 257                 currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
 258                 if(currentMetaTag != null && currentMetaContent != null) {
 259                 addMetaTag();
 260                 }
 261         }
 262      }
 263     )?
 264    )?
 265   )*
 266   <TagEnd>
 267 }
 268
  269 Token ArgValue() :
      // Parses an attribute value in one of three forms: unquoted,
      // single-quoted or double-quoted.
      // Returns the token holding the value text, or null when the value is
      // an empty quoted string ('' or ""), because t is then never assigned.
  270 {
  271   Token t = null;
  272 }
  273 {
  274   t=<ArgValue>                              { return t; }
      // LOOKAHEAD(2) tells an empty quoted value (quote directly followed by
      // the closing quote) apart from a quoted value that contains text.
  275 | LOOKAHEAD(2)
  276   <ArgQuote1> <CloseQuote1>                 { return t; }
  277 | <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
  278 | LOOKAHEAD(2)
  279   <ArgQuote2> <CloseQuote2>                 { return t; }
  280 | <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }
  281 }
 282
 283
  284 Token Decl() :
      // Parses a declaration: a DeclName token ("<!" followed by a letter)
      // and then any mix of attribute names, attribute values and '=' signs
      // up to the closing TagEnd. Returns the DeclName token.
  285 {
  286   Token t;
  287 }
  288 {
  289   t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  290   { return t; }
  291 }
 292
 293
  294 void CommentTag() :
      // Consumes a comment in either of its two forms: one opened by "<!--"
      // and closed by "-->", or one opened by "<!" and closed by ">".
      // No actions are attached, so the comment text is discarded.
  295 {}
  296 {
  297   (<Comment1> ( <CommentText1> )* <CommentEnd1>)
  298  |
  299   (<Comment2> ( <CommentText2> )* <CommentEnd2>)
  300 }
 301
  302 void ScriptTag() :
      // Consumes a script element from its ScriptStart token through the
      // matching ScriptEnd token. No actions are attached, so the script
      // text is discarded.
  303 {}
  304 {
  305   <ScriptStart> ( <ScriptText> )* <ScriptEnd>
  306 }
 307
 308
 309 TOKEN :
 310 {
 311   < ScriptStart: "<script" > : WithinScript
 312 | < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
 313 | < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
 314
 315 | < Comment1:  "<!--" > : WithinComment1
 316 | < Comment2:  "<!" >   : WithinComment2
 317
 318 | < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
 319                 <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+ >
 320 | < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
 321 | < #NUM:     ["0"-"9"] >
 322 | < #HEX:     ["0"-"9","A"-"F","a"-"f"] >
 323
 324 | < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >
 325
 326 | < Space:    (<SP>)+ >
 327 | < #SP:      [" ","\t","\r","\n"] >
 328
 329 | < Punct:    ~[] > // Keep this last.  It is a catch-all.
 330 }
 331
 332 <WithinScript> TOKEN:
 333 {
 334   < ScriptText:  (~["<",">"])+ | "<" | ">" >
 335 | < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT
 336 }
 337
  338 <WithinTag> TOKEN:
      // Tokens recognised inside a tag, after its name.
      // ArgName: first character is anything except whitespace, '=', '>' or
      //          a quote; it then runs up to the next whitespace, '=' or '>'.
      // ArgEquals: switches to AfterEquals, where an unquoted value may follow.
      // TagEnd: ">" or "=>"; returns to the DEFAULT state.
  339 {
  340   < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])
  341                (~[" ","\t","\r","\n","=",">"])* >
  342 | < ArgEquals: "=" >  : AfterEquals
  343 | < TagEnd:    ">" | "=>" >  : DEFAULT
  344 }
 345
  346 <AfterEquals> TOKEN:
      // Unquoted attribute value. The first character may not be whitespace,
      // '=', '>' or a quote; the value then runs up to the next whitespace or
      // '>'. Returns to WithinTag afterwards.
  347 {
  348   < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])
  349                (~[" ","\t","\r","\n",">"])* > : WithinTag
  350 }
 351
  352 <WithinTag, AfterEquals> TOKEN:
      // Opening quote of a quoted attribute value, recognised in both the
      // WithinTag and AfterEquals states; each switches to the state that
      // reads the text for that quote character.
  353 {
  354   < ArgQuote1: "'"  > : WithinQuote1
  355 | < ArgQuote2: "\"" > : WithinQuote2
  356 }
 357
  358 <WithinTag, AfterEquals> SKIP:
      // Whitespace inside a tag (between attributes and after '=') is skipped.
  359 {
  360   < <Space> >
  361 }
 362
  363 <WithinQuote1> TOKEN:
      // Single-quoted value: Quote1Text is everything up to the next
      // apostrophe; the closing apostrophe returns to WithinTag.
  364 {
  365   < Quote1Text:  (~["'"])+ >
  366 | < CloseQuote1: <ArgQuote1> > : WithinTag
  367 }
 368
  369 <WithinQuote2> TOKEN:
      // Double-quoted value: Quote2Text is everything up to the next double
      // quote; the closing double quote returns to WithinTag.
  370 {
  371   < Quote2Text:  (~["\""])+ >
  372 | < CloseQuote2: <ArgQuote2> > : WithinTag
  373 }
 374
 375
  376 <WithinComment1> TOKEN :
      // Body of a "<!--" comment. CommentText1 is either a run of characters
      // other than '-' or a single '-'; CommentEnd1 ("-->") returns to the
      // DEFAULT state.
  377 {
  378   < CommentText1:  (~["-"])+ | "-" >
  379 | < CommentEnd1:   "-->" > : DEFAULT
  380 }
 381
  382 <WithinComment2> TOKEN :
      // Body of a "<!" comment: CommentText2 is everything up to the next
      // '>', and that '>' (CommentEnd2) returns to the DEFAULT state.
  383 {
  384   < CommentText2:  (~[">"])+ >
  385 | < CommentEnd2:   ">" > : DEFAULT
  386 }